mspire 0.5.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
data/README.rdoc
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
= mspire
|
2
|
+
|
3
|
+
Tools for working with mass spectrometry data in Ruby.
|
4
|
+
|
5
|
+
== Examples
|
6
|
+
|
7
|
+
=== mzml
|
8
|
+
|
9
|
+
require 'ms/mzml'
|
10
|
+
|
11
|
+
MS::Mzml.open("somefile.mzml") do |mzml|
|
12
|
+
spectrum = mzml[0] # the first spectrum ( same as mzml.spectrum(0) )
|
13
|
+
spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"] # query by id string
|
14
|
+
mzml.spectrum_from_scan_num(23) # raises ScanNumbersNotFound or ScanNumbersNotUnique errors if problems
|
15
|
+
end
|
16
|
+
|
17
|
+
require 'ms/mass/aa'
|
18
|
+
|
19
|
+
MS::Mass::AA::MONO['A'] # or access by symbol
|
20
|
+
|
21
|
+
== Copyright
|
22
|
+
|
23
|
+
See LICENSE (MIT)
|
24
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
8
|
+
gem.name = "mspire"
|
9
|
+
gem.homepage = "http://github.com/princelab/mspire"
|
10
|
+
gem.license = "MIT"
|
11
|
+
gem.summary = %Q{mass spectrometry proteomics, lipidomics, and tools}
|
12
|
+
gem.description = %Q{mass spectrometry proteomics, lipidomics, and tools, a rewrite of mspire, merging of ms-* gems}
|
13
|
+
gem.email = "jtprince@gmail.com"
|
14
|
+
gem.authors = ["John T. Prince", "Simon Chiang"]
|
15
|
+
gem.add_dependency "nokogiri", "~> 1.5"
|
16
|
+
gem.add_development_dependency "rspec", "~> 2.6"
|
17
|
+
gem.add_development_dependency "jeweler", "~> 1.5.2"
|
18
|
+
gem.add_development_dependency "rcov", ">= 0"
|
19
|
+
gem.add_development_dependency "obo", ">= 0.1.0"
|
20
|
+
end
|
21
|
+
Jeweler::RubygemsDotOrgTasks.new
|
22
|
+
|
23
|
+
require 'rspec/core'
|
24
|
+
require 'rspec/core/rake_task'
|
25
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
26
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
27
|
+
end
|
28
|
+
|
29
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
30
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
31
|
+
spec.rcov = true
|
32
|
+
end
|
33
|
+
|
34
|
+
#require 'rcov/rcovtask'
|
35
|
+
#Rcov::RcovTask.new do |spec|
|
36
|
+
# spec.libs << 'spec'
|
37
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
38
|
+
# spec.verbose = true
|
39
|
+
#end
|
40
|
+
|
41
|
+
task :default => :spec
|
42
|
+
|
43
|
+
require 'rdoc/task'
|
44
|
+
Rake::RDocTask.new do |rdoc|
|
45
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
46
|
+
|
47
|
+
rdoc.rdoc_dir = 'rdoc'
|
48
|
+
rdoc.title = "mspire #{version}"
|
49
|
+
rdoc.rdoc_files.include('README*')
|
50
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
51
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.6.1
|
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
module CV
  # An Array of CV::Param objects that can be filled in with a small DSL:
  #
  #     desc = CV::Description.new do
  #       param 'cvref', 'accession', 'name'
  #     end
  class Description < Array
    # Seeds the array with +args+.  When a block is given it is evaluated in
    # the context of the new object, so +param+ (or any Array method) can be
    # called without an explicit receiver.  The block is optional.
    def initialize(*args, &block)
      super(args)
      # instance_eval raises ArgumentError when handed a nil block, so only
      # evaluate when the caller actually supplied one.
      instance_eval(&block) if block
    end

    # pushes a CV::Param object (built from +args+) onto the description array
    def param(*args)
      push CV::Param.new(*args)
    end

    # asks every contained param to write itself to the +xml+ builder
    def to_xml(xml)
      each { |param| param.to_xml(xml) }
    end
  end
end
|
data/lib/cv/param.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
module CV
  # A controlled-vocabulary parameter: a reference to the vocabulary, an
  # accession, a name, and an optional value and unit.
  class Param
    attr_accessor :cv_ref, :accession, :name, :value
    # A valueless CV::Param object that describes the units being used
    attr_accessor :unit

    def initialize(cv_ref, accession, name, value=nil)
      @cv_ref = cv_ref
      @accession = accession
      @name = name
      @value = value
    end

    # Sends a message called +name+ (default :cvParam) to the +xml+ builder
    # with a hash of this param's attributes.  :value is only included when
    # a value is set; the unit* attributes only when a unit is set.
    def to_xml(xml, name=:cvParam)
      # NOTE: the instance variable is @cv_ref (see initialize); reading
      # @cvref here would always emit a nil cvRef.
      attributes = {:cvRef => @cv_ref, :accession => @accession, :name => @name}
      attributes[:value] = @value if @value
      if unit
        attributes[:unitCvRef] = unit.cv_ref
        attributes[:unitAccession] = unit.accession
        attributes[:unitName] = unit.name
      end
      xml.send(name, attributes)
    end

    # Two params are equal when +other+ is a CV::Param and cv_ref, accession,
    # name, value and unit all match.
    def ==(other)
      return false unless other.is_a?(CV::Param)
      [:cv_ref, :accession, :name, :value, :unit].all? do |mthd|
        send(mthd) == other.send(mthd)
      end
    end
  end
end
|
33
|
+
|
data/lib/cv.rb
ADDED
data/lib/io/bookmark.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
class IO
  # Saves the current position, hands the io object to the block, and then
  # returns to the saved position.  Returns the block's reply.  If +rewind+
  # is true, io.rewind is called before handing the io object to the block.
  #
  # The position is restored in an +ensure+ clause, so it is put back even
  # when the block raises.
  def bookmark(rewind=false, &block)
    start = pos
    # the parameter shadows IO#rewind, hence the explicit receiver
    self.rewind if rewind
    block.call(self)
  ensure
    # start is nil only if reading the position itself failed
    self.pos = start if start
  end
end
|
data/lib/merge.rb
ADDED
data/lib/ms/cvlist.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
|
2
|
+
require 'cv'
|
3
|
+
require 'obo/ms'
|
4
|
+
require 'obo/ims'
|
5
|
+
require 'obo/unit'
|
6
|
+
|
7
|
+
module MS
  module CV
    # Maps an ontology prefix (the part of an accession before the colon)
    # to that ontology's id => name lookup.
    Obo = {
      'MS' => Obo::MS.id_to_name,
      'IMS' => Obo::IMS.id_to_name,
      'UO' => Obo::Unit.id_to_name,
    }

    class Param < ::CV::Param
      # takes a variety of arguments (acc = accession):
      #
      #     acc#
      #     acc#, value
      #     acc#, unit_acc# or CV::Param object
      #     acc#, value, unit_acc# or CV::Param object
      #     cvref, acc#, name
      #     cvref, acc#, name, value
      #     cvref, acc#, name, unit_acc# or CV::Param object
      #     cvref, acc#, name, value, unit_acc# or CV::Param object
      #
      # When more than one argument is given and the last one is a CV::Param
      # or a String that looks like an accession, it is taken as the unit.
      def initialize(*args)
        last = args.last
        # Only Strings are pattern-matched: calling =~ on an arbitrary value
        # (e.g. an Integer) is a NoMethodError on Ruby >= 3.2.
        looks_like_unit = last.is_a?(::CV::Param) ||
          (last.is_a?(String) && last =~ /[A-Za-z]+:\d+/)
        @unit =
          if args.size > 1 && looks_like_unit
            unit_arg = args.pop
            unit_arg.is_a?(::CV::Param) ? unit_arg : self.class.new(unit_arg)
          end
        (@cv_ref, @accession, @name, @value) =
          case args.size
          when 1..2 # accession number (maybe with value)
            obo_type = args.first.split(':').first
            [obo_type, args.first, MS::CV::Obo[obo_type][args.first], args[1]]
          when 3..4 # they have fully specified the object
            args
          end
      end
    end
  end

  # An Array of CV::Param objects.
  #
  #     CVList.new( <CV::Param> )
  #     CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
  #     CVList.new do
  #       param 'MS:1000004'
  #       param 'MS:1000042', 23
  #     end
  class CVList < Array

    # ensures that each argument is an argument that can be handled by
    # CV::Param. Returns the CVList object it creates.  An Array argument
    # is splatted into the argument list for a single param.
    def self.[](*args)
      list = self.new
      args.each do |arg|
        arg.is_a?(Array) ? list.param(*arg) : list.param(arg)
      end
      list
    end

    # takes a list of valid CV::Param objects, or they can be set in the block
    # using param
    def initialize(*args, &block)
      args.each { |arg| param(arg) }
      instance_eval(&block) if block
    end

    # if the first object is a CV::Param it is just pushed onto the list,
    # otherwise the arguments are sent in to initialize a fresh MS::CV::Param,
    # and this object is pushed onto the list.
    def param(*args)
      push args.first.is_a?(::CV::Param) ? args.first : MS::CV::Param.new(*args)
    end
  end
end
|
data/lib/ms/digester.rb
ADDED
@@ -0,0 +1,245 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
module MS

  # A Digester splits a protein sequence into peptides at specified sites.
  #
  #   trypsin = MS::Digester[:trypsin]
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
  #   # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG']
  #
  # With 1 missed cleavage:
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK',
  #   #     'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG']
  #
  # Return the start and end sites of digestion:
  #
  #   trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => [[0,6],[0,24],[6,24],[6,33],[24,33]]
  class Digester

    # The name of the digester
    attr_reader :name

    # A string of residues at which cleavage occurs
    attr_reader :cleave_str

    # A c-terminal restriction residue which prevents
    # cleavage at a potential cleavage site (optional).
    attr_reader :cterm_exception

    # True if cleavage occurs at the c-terminus of a
    # cleavage residue, false if cleavage occurs at
    # the n-terminus.
    attr_reader :cterm_cleavage

    MULTILINE_WHITESPACE = /\s*/m

    # name::            the name of the digester
    # cleave_str::      every character is a residue at which cleavage occurs
    # cterm_exception:: nil, empty, or a single residue (anything longer
    #                   raises ArgumentError)
    # cterm_cleavage::  true to cleave after the residue, false to cleave
    #                   before it
    def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
      @name = name
      @cleave_str = cleave_str
      # one alternative per residue; union also escapes regexp metacharacters
      @cleave_regexp = Regexp.union(cleave_str.each_char.to_a)
      @cterm_exception =
        if cterm_exception.nil? || cterm_exception.empty?
          nil
        elsif cterm_exception.length == 1
          cterm_exception[0]
        else
          raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
        end

      @cterm_cleavage = cterm_cleavage
      @scanner = StringScanner.new('')
    end

    # Returns digestion sites in sequence, as determined by the
    # cleave_regexp boundaries.  The digestion sites correspond to the
    # positions where a peptide begins and ends, such that [n, (n+1) - n]
    # corresponds to the [index, length] for peptide n.
    #
    #   d = Digester.new('Trypsin', 'KR', 'P')
    #   seq = "AARGGR"
    #   sites = d.cleavage_sites(seq)        # => [0, 3, 6]
    #
    #   seq[sites[0], sites[0+1] - sites[0]] # => "AAR"
    #   seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
    #
    # Trailing whitespace is included in the fragment.
    #
    #   seq = "AAR  \n  GGR"
    #   sites = d.cleavage_sites(seq)        # => [0, 8, 11]
    #
    #   seq[sites[0], sites[0+1] - sites[0]] # => "AAR  \n  "
    #   seq[sites[1], sites[1+1] - sites[1]] # => "GGR"
    #
    # The digested section of sequence may be specified using offset
    # and length.
    def cleavage_sites(seq, offset=0, length=seq.length-offset)
      return [0, 1] if seq.size == 1 # adding exceptions is lame--algorithm should just work

      adjustment = cterm_cleavage ? 0 : 1
      limit = offset + length

      positions = [offset]
      scan(seq, offset, limit) do |site|
        positions << (site - adjustment)
      end

      # Close the final fragment at limit unless a cleavage site already
      # landed exactly there.  A lone start position always gets a partner so
      # that at least one [start, end] pair exists.
      positions << limit if positions.length == 1 || positions.last != limit
      positions
    end

    # Returns digestion sites of sequence as [start_index, end_index] pairs,
    # allowing for missed cleavages.  Digestion sites are determined using
    # cleavage_sites; as in that method, the digested section of sequence
    # may be specified using offset and length.
    #
    # Each [start_index, end_index] pair is yielded to the block, if given,
    # and the collected results are returned.
    def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index
      frag_sites = cleavage_sites(seq, offset, length)

      overlay(frag_sites.length, max_misses, 1) do |first, last|
        start_index = frag_sites[first]
        end_index = frag_sites[last]

        block ? block.call(start_index, end_index) : [start_index, end_index]
      end
    end

    # Returns an array of peptides produced by digesting sequence, allowing for
    # missed cleavage sites.  Digestion sites are determined using cleavage_sites;
    # as in that method, the digested section of sequence may be specified using
    # offset and length.
    def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
      site_digest(seq, max_misses, offset, length).map do |s, e|
        seq[s, e-s]
      end
    end

    protected

    # The cleavage regexp used to identify cleavage sites
    attr_reader :cleave_regexp # :nodoc:

    # The scanner used to digest strings.
    attr_reader :scanner # :nodoc:

    # Scans seq between offset and limit for the cleave_regexp, skipping whitespace
    # and being mindful of exception characters.  The positions of the scanner at
    # each match are yielded to the block.  Returns the final scanner position.
    def scan(seq, offset, limit, &block) # :nodoc:
      scanner.string = seq
      scanner.pos = offset

      while scanner.search_full(cleave_regexp, true, false)
        # swallow whitespace after the residue so it stays with this fragment
        scanner.search_full(MULTILINE_WHITESPACE, true, false)
        pos = scanner.pos

        # skip if the next character is the exception character
        next if cterm_exception != nil && seq[pos] == cterm_exception

        # break if you scanned past the upper limit
        break if pos > limit

        block.call(pos)
      end

      scanner.pos
    end

    # Performs an overlap-collect algorithm providing the start and end
    # indices of spans skipping up to max_misses boundaries.
    def overlay(n, max_misses, offset, &block) # :nodoc:
      results = []
      0.upto(n-1) do |start_index|
        0.upto(max_misses) do |n_miss|
          end_index = start_index + offset + n_miss
          break if end_index == n

          results << block.call(start_index, end_index)
        end
      end
      results
    end

    #
    # Enzymes adapted from the default Mascot enzyme list.
    #

    class << self
      # takes the name of the enzyme in any case (symbol or string)
      # and accesses the constant (returns nil if none found)
      def [](enzyme_name)
        ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ]
      end

      # Utility method to parse a mascot enzyme configuration
      # string (tab separated) into a Digester.  The fields are: name, sense
      # ('C-Term' or 'N-Term'), cleave residues, c-terminal exception, and two
      # further fields that are not used here.
      def mascot_parse(str) # :nodoc:
        name, sense, cleave_str, cterm_exception = str.split(/ *\t */)
        cterm_cleavage = case sense
        when 'C-Term' then true
        when 'N-Term' then false
        else raise ArgumentError, "unknown sense: #{sense}"
        end

        new(name, cleave_str, cterm_exception, cterm_cleavage)
      end
    end

    # Tab separated mascot configuration strings, keyed by enzyme symbol.
    # An empty fourth field means the enzyme has no c-terminal exception.
    #
    #   mascot_parse("Arg-C\tC-Term\tR\tP\tno\tno")
    #   ENZYMES[:arg_c] = <'Arg-C' enzyme>
    MASCOT_ENZYME_CONFIG_STRINGS = {
      :arg_c => "Arg-C\tC-Term\tR\tP\tno\tno",
      :asp_n => "Asp-N\tN-Term\tBD\t\tno\tno",
      :asp_n_ambic => "Asp-N_ambic\tN-Term\tDE\t\tno\tno",
      :chymotrypsin => "Chymotrypsin\tC-Term\tFLWY\tP\tno\tno",
      :cnbr => "CNBr\tC-Term\tM\t\tno\tno",
      :lys_c => "Lys-C\tC-Term\tK\tP\tno\tno",
      :lys_c_p => "Lys-C/P\tC-Term\tK\t\tno\tno",
      :pepsin_a => "PepsinA\tC-Term\tFL\t\tno\tno",
      :tryp_cnbr => "Tryp-CNBr\tC-Term\tKMR\tP\tno\tno",
      :tryp_chymo => "TrypChymo\tC-Term\tFKLRWY\tP\tno\tno",
      :trypsin_p => "Trypsin/P\tC-Term\tKR\t\tno\tno",
      :v8_de => "V8-DE\tC-Term\tBDEZ\tP\tno\tno",
      :v8_e => "V8-E\tC-Term\tEZ\tP\tno\tno",
      :trypsin => "Trypsin\tC-Term\tKR\tP\tno\tno",
      :v8_e_trypsin => "V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno",
      :v8_de_trypsin => "V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno",
    }

    # enzyme symbol => Digester, built from MASCOT_ENZYME_CONFIG_STRINGS
    ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.each_with_object({}) do |(key, config), enzymes|
      enzymes[key] = mascot_parse(config)
    end
  end
end
|
data/lib/ms/fasta.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'stringio'
|
3
|
+
|
4
|
+
class Bio::FlatFile
|
5
|
+
include Enumerable
|
6
|
+
end
|
7
|
+
|
8
|
+
class Bio::FastaFormat
|
9
|
+
alias_method :header, :definition
|
10
|
+
alias_method :sequence, :seq
|
11
|
+
end
|
12
|
+
|
13
|
+
module MS
|
14
|
+
# A convenience class for working with fasta formatted sequence databases.
|
15
|
+
# the file which includes this class also includes Enumerable with
|
16
|
+
# Bio::FlatFile so you can do things like this:
|
17
|
+
#
|
18
|
+
# accessions = MS::Fasta.open("file.fasta") do |fasta|
|
19
|
+
# fasta.map(&:accession)
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# A few aliases are added to Bio::FastaFormat
|
23
|
+
#
|
24
|
+
# entry.header == entry.definition
|
25
|
+
# entry.sequence == entry.seq
|
26
|
+
#
|
27
|
+
# MS::Fasta.new accepts both an IO object or a String (a fasta formatted
|
28
|
+
# string itself)
|
29
|
+
#
|
30
|
+
# # taking an io object:
|
31
|
+
# File.open("file.fasta") do |io|
|
32
|
+
# fasta = MS::Fasta.new(io)
|
33
|
+
# ... do something with it
|
34
|
+
# end
|
35
|
+
# # taking a string
|
36
|
+
# string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
|
37
|
+
# fasta = MS::Fasta.new(string)
|
38
|
+
# (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
|
39
|
+
module Fasta
|
40
|
+
|
41
|
+
# opens the flatfile and yields a Bio::FlatFile object
|
42
|
+
def self.open(file, &block)
|
43
|
+
Bio::FlatFile.open(Bio::FastaFormat, file, &block)
|
44
|
+
end
|
45
|
+
|
46
|
+
# yields each Bio::FastaFormat object in turn
|
47
|
+
def self.foreach(file, &block)
|
48
|
+
Bio::FlatFile.open(Bio::FastaFormat, file) do |fasta|
|
49
|
+
fasta.each(&block)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# takes an IO object or a string that is the fasta data itself
|
54
|
+
def self.new(io)
|
55
|
+
io = StringIO.new(io) if io.is_a?(String)
|
56
|
+
Bio::FlatFile.new(Bio::FastaFormat, io)
|
57
|
+
end
|
58
|
+
|
59
|
+
# returns two hashes [id_to_length, id_to_description]
|
60
|
+
# faster (~4x) than official route.
|
61
|
+
# Reads the fasta +file+ line by line and returns two hashes:
# [id_to_length, id_to_description].
#
# The id is the run of non-whitespace characters after '>'; the description
# is whatever follows a single separating space (an empty string when the
# header holds only an id).  Lengths are the summed sizes of the chomped
# sequence lines.  Lines that appear before the first header are ignored.
def self.protein_lengths_and_descriptions(file)
  protid_to_length = {}
  protid_to_description = {}
  # the space is optional so that headers without a description still match
  header_re = /^>(\S+) ?(.*)/
  current_id = nil
  # File.foreach never treats a leading '|' in +file+ as a command to run
  File.foreach(file) do |line|
    line.chomp!
    if (md = header_re.match(line))
      current_id = md[1]
      protid_to_length[current_id] = 0
      protid_to_description[current_id] = md[2]
    elsif current_id
      protid_to_length[current_id] += line.size
    end
  end
  [protid_to_length, protid_to_description]
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|