mspire 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
data/README.rdoc
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
= mspire
|
|
2
|
+
|
|
3
|
+
Tools for working with mass spectrometry data in ruby.
|
|
4
|
+
|
|
5
|
+
== Examples
|
|
6
|
+
|
|
7
|
+
=== mzml
|
|
8
|
+
|
|
9
|
+
require 'ms/mzml'
|
|
10
|
+
|
|
11
|
+
MS::Mzml.open("somefile.mzml") do |mzml|
|
|
12
|
+
spectrum = mzml[0] # the first spectrum ( same as mzml.spectrum(0) )
|
|
13
|
+
spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"] # query by id string
|
|
14
|
+
mzml.spectrum_from_scan_num(23) # raises ScanNumbersNotFound or ScanNumbersNotUnique errors if problems
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
require 'ms/mass/aa'
|
|
18
|
+
|
|
19
|
+
MS::Mass::AA::MONO['A'] # or access by symbol
|
|
20
|
+
|
|
21
|
+
== Copyright
|
|
22
|
+
|
|
23
|
+
See LICENSE (MIT)
|
|
24
|
+
|
data/Rakefile
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Build, test and documentation tasks for the mspire gem.
require 'rubygems'
require 'rake'
require 'rspec/core'
require 'rspec/core/rake_task'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "mspire"
  gem.homepage = "http://github.com/princelab/mspire"
  gem.license = "MIT"
  gem.summary = %Q{mass spectrometry proteomics, lipidomics, and tools}
  gem.description = %Q{mass spectrometry proteomics, lipidomics, and tools, a rewrite of mspire, merging of ms-* gems}
  gem.email = "jtprince@gmail.com"
  gem.authors = ["John T. Prince", "Simon Chiang"]
  # NOTE(review): lib/ms/fasta.rb does `require 'bio'`, but no 'bio'
  # dependency is declared here -- confirm whether it should be added.
  gem.add_dependency "nokogiri", "~> 1.5"
  gem.add_development_dependency "rspec", "~> 2.6"
  gem.add_development_dependency "jeweler", "~> 1.5.2"
  gem.add_development_dependency "rcov", ">= 0"
  gem.add_development_dependency "obo", ">= 0.1.0"
end
Jeweler::RubygemsDotOrgTasks.new

# `rake spec` runs every *_spec.rb file under spec/
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with rcov enabled
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |spec|
#  spec.libs << 'spec'
#  spec.pattern = 'spec/**/*_spec.rb'
#  spec.verbose = true
#end

task :default => :spec

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip the trailing newline of the VERSION file so that it does not end
  # up inside the rdoc title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mspire #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.6.1
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
|
|
2
|
+
module CV
  # An Array of param objects that can be filled in through a small DSL:
  #
  #     desc = CV::Description.new do
  #       param 'MS', 'MS:1000004', 'sample mass', 2.0
  #     end
  class Description < Array
    # Seeds the array with +args+.  When a block is given it is evaluated
    # in the context of the new object, so +param+ can be called without a
    # receiver.  The block is optional.
    def initialize(*args, &block)
      super(args)
      # instance_eval with neither a block nor a string raises
      # ArgumentError, so only evaluate when a block was actually passed
      instance_eval(&block) if block
    end

    # pushes a CV::Param object (built from +args+) onto the description
    # array
    def param(*args)
      push CV::Param.new(*args)
    end

    # Calls to_xml(xml) on each element, in order.
    def to_xml(xml)
      each { |param| param.to_xml(xml) }
    end
  end
end
|
data/lib/cv/param.rb
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
|
|
2
|
+
module CV
  # A controlled vocabulary parameter: a CV reference, an accession, a name
  # and an optional value, with an optional unit (itself a CV::Param).
  class Param
    attr_accessor :cv_ref, :accession, :name, :value
    # A valueless CV::Param object that describes the units being used
    attr_accessor :unit

    def initialize(cv_ref, accession, name, value=nil)
      @cv_ref = cv_ref
      @accession = accession
      @name = name
      @value = value
    end

    # Sends +name+ (default :cvParam) to +xml+ with a hash of attributes
    # (:cvRef, :accession, :name, plus :value when a value is set and
    # :unitCvRef/:unitAccession/:unitName when a unit is set).  Returns
    # whatever that call returns.
    def to_xml(xml, name=:cvParam)
      # the instance variable is @cv_ref (see initialize); reading @cvref
      # would always emit a nil cvRef
      attributes = { :cvRef => @cv_ref, :accession => @accession, :name => @name }
      attributes[:value] = @value if @value
      if unit
        attributes.merge!( { :unitCvRef => unit.cv_ref,
                             :unitAccession => unit.accession,
                             :unitName => unit.name } )
      end
      xml.send(name, attributes)
    end

    # True when +other+ is a CV::Param whose cv_ref, accession, name, value
    # and unit are all == to this object's.
    def ==(other)
      return false unless other.is_a?(CV::Param)
      [:cv_ref, :accession, :name, :value, :unit].all? do |mthd|
        send(mthd) == other.send(mthd)
      end
    end
  end
end
|
|
33
|
+
|
data/lib/cv.rb
ADDED
data/lib/io/bookmark.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
|
|
2
|
+
class IO
  # saves the position and returns to it after the block
  # is executed.  Returns the block's reply.  if rewind, io.rewind is called
  # before handing the io object to the block.
  #
  # The saved position is restored even when the block raises.
  def bookmark(rewind=false, &block)
    start = self.pos
    self.rewind if rewind
    begin
      block.call(self)
    ensure
      self.pos = start
    end
  end
end
|
data/lib/merge.rb
ADDED
data/lib/ms/cvlist.rb
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
|
|
2
|
+
require 'cv'
|
|
3
|
+
require 'obo/ms'
|
|
4
|
+
require 'obo/ims'
|
|
5
|
+
require 'obo/unit'
|
|
6
|
+
|
|
7
|
+
module MS
  module CV
    # Maps an ontology prefix to that ontology's id_to_name lookup.
    # The top-level ::Obo is referenced explicitly so the lookup can never
    # resolve to this very constant (MS::CV::Obo).
    Obo = {
      'MS' => ::Obo::MS.id_to_name,
      'IMS' => ::Obo::IMS.id_to_name,
      'UO' => ::Obo::Unit.id_to_name,
    }

    class Param < ::CV::Param
      # takes a variety of arguments (acc = accession):
      #
      #     acc#
      #     acc#, value
      #     acc#, unit_acc# or CV::Param object
      #     acc#, value, unit_acc# or CV::Param object
      #     cvref, acc#, name
      #     cvref, acc#, name, value
      #     cvref, acc#, name, unit_acc# or CV::Param object
      #     cvref, acc#, name, value, unit_acc# or CV::Param object
      #
      # When only an accession is given, the cv_ref is the text before the
      # ':' and the name is looked up in MS::CV::Obo.
      def initialize(*args)
        @unit =
          if args.size > 1 && unit_argument?(args.last)
            unit_arg = args.pop
            unit_arg.is_a?(::CV::Param) ? unit_arg : self.class.new(unit_arg)
          end
        (@cv_ref, @accession, @name, @value) =
          case args.size
          when 1..2 # accession number (maybe with value)
            obo_type = args.first.split(':').first
            [obo_type, args.first, MS::CV::Obo[obo_type][args.first], args[1]]
          when 3..4 # they have fully specified the object
            args
          end
      end

      private

      # True if +arg+ is a CV::Param or a String containing something that
      # looks like an accession ("letters:digits").  Only Strings are
      # matched against the regexp: the last argument may be a plain value
      # such as a number, and calling =~ on an arbitrary object is not
      # supported on current rubies.
      def unit_argument?(arg)
        return true if arg.is_a?(::CV::Param)
        arg.is_a?(String) && !(arg =~ /[A-Za-z]+:\d+/).nil?
      end
    end
  end

  #     CVList.new( <CV::Param> )
  #     CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
  #     CVList.new do
  #       param 'MS:1000004'
  #       param 'MS:1000042', 23
  #     end
  class CVList < Array

    # ensures that each argument is an argument that can be handled by
    # CV::Param.  Array arguments are splatted into param.  Returns the
    # CVList object it creates
    def self.[](*args)
      list = self.new
      args.each do |arg|
        arg.is_a?(Array) ? list.param(*arg) : list.param(arg)
      end
      list
    end

    # takes a list of valid CV::Param objects, or they can be set in the block
    # using param
    def initialize(*args, &block)
      args.each {|arg| param(arg) }
      instance_eval(&block) if block
    end

    # if the first object is a CV::Param it is just pushed onto the list,
    # otherwise the arguments are sent in to initialize a fresh MS::CV::Param,
    # and this object is pushed onto the list.
    def param(*args)
      push args.first.is_a?(::CV::Param) ? args.first : MS::CV::Param.new(*args)
    end
  end
end
|
data/lib/ms/digester.rb
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
require 'strscan'
|
|
2
|
+
|
|
3
|
+
module MS

  # A Digester splits a protein sequence into peptides at specified sites.
  #
  #   trypsin = MS::Digester[:trypsin]
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
  #   # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG']
  #
  # With 1 missed cleavage:
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK',
  #   #     'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG']
  #
  # Return the start and end sites of digestion:
  #
  #   trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => [[0,6],[0,24],[6,24],[6,33],[24,33]]
  #
  # A Digester keeps a single StringScanner as instance state and reuses it
  # for every scan.
  class Digester

    # The name of the digester
    attr_reader :name

    # A string of residues at which cleavage occurs
    attr_reader :cleave_str

    # A c-terminal restriction residue which prevents
    # cleavage at a potential cleavage site (optional).
    attr_reader :cterm_exception

    # True if cleavage occurs at the c-terminus of a
    # cleavage residue, false if cleavage occurs at
    # the n-terminus.
    attr_reader :cterm_cleavage

    MULTILINE_WHITESPACE = /\s*/m

    # name::            the name of the digester
    # cleave_str::      each character is a residue at which cleavage occurs
    # cterm_exception:: nil, an empty string, or a single residue
    # cterm_cleavage::  see the attribute of the same name
    #
    # Raises ArgumentError if cterm_exception is longer than one residue.
    def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
      @name = name
      @cleave_str = cleave_str
      # Regexp.union escapes every residue character and yields a regexp
      # that never matches when cleave_str is empty (an empty pattern would
      # match everywhere without advancing the scanner).
      @cleave_regexp = Regexp.union(cleave_str.chars.to_a)
      @cterm_exception =
        if cterm_exception.nil? || cterm_exception.empty?
          nil
        elsif cterm_exception.length == 1
          cterm_exception[0]
        else
          raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
        end

      @cterm_cleavage = cterm_cleavage
      @scanner = StringScanner.new('')
    end

    # Returns digestion sites in sequence, as determined by the
    # cleave_regexp boundaries.  The digestion sites correspond to the
    # positions where a peptide begins and ends, such that [n, (n+1) - n]
    # corresponds to the [index, length] for peptide n.
    #
    #   d = Digester.new('Trypsin', 'KR', 'P')
    #   seq = "AARGGR"
    #   sites = d.cleavage_sites(seq)            # => [0, 3, 6]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]     # => "AAR"
    #   seq[sites[1], sites[1+1] - sites[1]]     # => "GGR"
    #
    # Trailing whitespace is included in the fragment.
    #
    #   seq = "AAR  \n  GGR"
    #   sites = d.cleavage_sites(seq)            # => [0, 8, 11]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]     # => "AAR  \n  "
    #   seq[sites[1], sites[1+1] - sites[1]]     # => "GGR"
    #
    # The digested section of sequence may be specified using offset
    # and length.
    def cleavage_sites(seq, offset=0, length=seq.length-offset)
      return [0, 1] if seq.size == 1 # adding exceptions is lame--algorithm should just work

      # for n-terminal cleavage the site lies one residue before the
      # scanner position reported after the match
      adjustment = cterm_cleavage ? 0 : 1
      limit = offset + length

      positions = [offset]
      final_pos = scan(seq, offset, limit) do |site|
        positions << (site - adjustment)
      end

      # add the final position
      positions << limit if (final_pos < limit) || (positions.length == 1)

      # adding exceptions is lame.. this code probably needs to be
      # refactored (corrected).
      positions << limit if !cterm_cleavage && final_pos == limit

      positions
    end

    # Returns digestion sites of sequence as [start_index, end_index] pairs,
    # allowing for missed cleavages.  Digestion sites are determined using
    # cleavage_sites; as in that method, the digested section of sequence
    # may be specified using offset and length.
    #
    # Each [start_index, end_index] pair is yielded to the block, if given,
    # and the collected results are returned.
    def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index
      frag_sites = cleavage_sites(seq, offset, length)

      overlay(frag_sites.length, max_misses, 1) do |start_index, end_index|
        start_site = frag_sites[start_index]
        end_site = frag_sites[end_index]

        block ? block.call(start_site, end_site) : [start_site, end_site]
      end
    end

    # Returns an array of peptides produced by digesting sequence, allowing for
    # missed cleavage sites.  Digestion sites are determined using cleavage_sites;
    # as in that method, the digested section of sequence may be specified using
    # offset and length.
    def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
      site_digest(seq, max_misses, offset, length).map do |s, e|
        seq[s, e-s]
      end
    end

    protected

    # The cleavage regexp used to identify cleavage sites
    attr_reader :cleave_regexp # :nodoc:

    # The scanner used to digest strings.
    attr_reader :scanner # :nodoc:

    # Scans seq between offset and limit for the cleave_regexp, skipping whitespace
    # and being mindful of exception characters.  The positions of the scanner at
    # each match are yielded to the block.  Returns the final scanner position.
    def scan(seq, offset, limit, &block) # :nodoc:
      scanner.string = seq
      scanner.pos = offset

      while scanner.search_full(cleave_regexp, true, false)
        # swallow any whitespace directly after the cleavage residue
        scanner.search_full(MULTILINE_WHITESPACE, true, false)
        pos = scanner.pos

        # skip if the next character is the exception character
        next if !cterm_exception.nil? && seq[pos] == cterm_exception

        # break if you scanned past the upper limit
        break if pos > limit

        block.call(pos)
      end

      scanner.pos
    end

    # Performs an overlap-collect algorithm providing the start and end
    # indices of spans skipping up to max_misses boundaries.
    def overlay(n, max_misses, offset, &block) # :nodoc:
      results = []
      0.upto(n-1) do |start_index|
        0.upto(max_misses) do |n_miss|
          end_index = start_index + offset + n_miss
          break if end_index == n

          results << block.call(start_index, end_index)
        end
      end
      results
    end

    #
    # Enzymes adapted from the default Mascot enzyme list.
    #

    class << self
      # takes the name of the enzyme in any case (symbol or string)
      # and accesses the constant (returns nil if none found)
      def [](enzyme_name)
        ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ]
      end

      # Utility method to parse a mascot enzyme configuration
      # string (tab separated) into a Digester.  The fields are name,
      # sense ('C-Term' or 'N-Term'), cleave residues and the c-terminal
      # exception; any further fields are ignored.
      #
      # Raises ArgumentError if the sense is not recognized.
      def mascot_parse(str) # :nodoc:
        name, sense, cleave_str, cterm_exception = str.split(/ *\t */)
        cterm_cleavage = case sense
        when 'C-Term' then true
        when 'N-Term' then false
        else raise ArgumentError, "unknown sense: #{sense}"
        end

        new(name, cleave_str, cterm_exception, cterm_cleavage)
      end
    end

    # Tab separated mascot configuration strings, one per enzyme:
    #
    #   name, sense, cleave residues, c-terminal exception, independent, semi-specific
    #
    # The separators are written as explicit "\t" escapes because
    # mascot_parse splits on tab characters.  An empty exception field
    # means the enzyme has no c-terminal exception.
    MASCOT_ENZYME_CONFIG_STRINGS = {
      :arg_c => "Arg-C\tC-Term\tR\tP\tno\tno",
      :asp_n => "Asp-N\tN-Term\tBD\t\tno\tno",
      :asp_n_ambic => "Asp-N_ambic\tN-Term\tDE\t\tno\tno",
      :chymotrypsin => "Chymotrypsin\tC-Term\tFLWY\tP\tno\tno",
      :cnbr => "CNBr\tC-Term\tM\t\tno\tno",
      :lys_c => "Lys-C\tC-Term\tK\tP\tno\tno",
      :lys_c_p => "Lys-C/P\tC-Term\tK\t\tno\tno",
      :pepsin_a => "PepsinA\tC-Term\tFL\t\tno\tno",
      :tryp_cnbr => "Tryp-CNBr\tC-Term\tKMR\tP\tno\tno",
      :tryp_chymo => "TrypChymo\tC-Term\tFKLRWY\tP\tno\tno",
      :trypsin_p => "Trypsin/P\tC-Term\tKR\t\tno\tno",
      :v8_de => "V8-DE\tC-Term\tBDEZ\tP\tno\tno",
      :v8_e => "V8-E\tC-Term\tEZ\tP\tno\tno",
      :trypsin => "Trypsin\tC-Term\tKR\tP\tno\tno",
      :v8_e_trypsin => "V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno",
      :v8_de_trypsin => "V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno",
    }

    # One Digester per configuration string, e.g.
    # ENZYMES[:arg_c] is the 'Arg-C' digester.
    ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.inject(Hash.new) do |hash,(k,v)|
      hash[k] = mascot_parse(v)
      hash
    end
  end
end
|
data/lib/ms/fasta.rb
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
require 'bio'
|
|
2
|
+
require 'stringio'
|
|
3
|
+
|
|
4
|
+
# Mix Enumerable into Bio::FlatFile so that its objects respond to
# map, partition, and the other Enumerable methods.
Bio::FlatFile.class_eval do
  include Enumerable
end

# Convenience aliases on fasta entries:
#
#     entry.header   == entry.definition
#     entry.sequence == entry.seq
Bio::FastaFormat.class_eval do
  alias_method :header, :definition
  alias_method :sequence, :seq
end
|
|
12
|
+
|
|
13
|
+
module MS
  # A convenience class for working with fasta formatted sequence databases.
  # the file which includes this class also includes Enumerable with
  # Bio::FlatFile so you can do things like this:
  #
  #     accessions = MS::Fasta.open("file.fasta") do |fasta|
  #       fasta.map(&:accession)
  #     end
  #
  # A few aliases are added to Bio::FastaFormat
  #
  #     entry.header == entry.definition
  #     entry.sequence == entry.seq
  #
  # MS::Fasta.new accepts both an IO object or a String (a fasta formatted
  # string itself)
  #
  #     # taking an io object:
  #     File.open("file.fasta") do |io|
  #       fasta = MS::Fasta.new(io)
  #       ... do something with it
  #     end
  #     # taking a string
  #     string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
  #     fasta = MS::Fasta.new(string)
  #     (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
  module Fasta

    # opens the flatfile and yields a Bio::FlatFile object
    def self.open(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file, &block)
    end

    # yields each Bio::FastaFormat object in turn
    def self.foreach(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file) do |fasta|
        fasta.each(&block)
      end
    end

    # takes an IO object or a string that is the fasta data itself
    def self.new(io)
      io = StringIO.new(io) if io.is_a?(String)
      Bio::FlatFile.new(Bio::FastaFormat, io)
    end

    # returns two hashes [id_to_length, id_to_description]
    # faster (~4x) than official route.
    #
    # The id is the text between '>' and the first whitespace character of
    # a header line; the description is whatever follows the single space
    # after the id (an empty string when the header holds only an id).  The
    # length is the summed size of the chomped lines that follow a header.
    # Lines that appear before the first header are ignored.
    def self.protein_lengths_and_descriptions(file)
      protid_to_length = {}
      protid_to_description = {}
      # the description part is optional so that a bare ">id" header is
      # still recognized as a header instead of being counted as sequence
      header_re = /^>([^\s]+)(?: (.*))?/
      current_id = nil
      # File.foreach (unlike IO.foreach) never treats a leading '|' in the
      # file name as a command to run
      File.foreach(file) do |line|
        line.chomp!
        if (md = header_re.match(line))
          current_id = md[1]
          protid_to_length[current_id] = 0
          protid_to_description[current_id] = md[2] || ''
        elsif current_id
          protid_to_length[current_id] += line.size
        end
      end
      [protid_to_length, protid_to_description]
    end

  end
end
|