mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/bsearch.rb
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Ruby/Bsearch - a binary search library for Ruby.
|
|
3
|
+
#
|
|
4
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
|
5
|
+
# All rights reserved.
|
|
6
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
|
7
|
+
#
|
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
|
9
|
+
# the Ruby's licence.
|
|
10
|
+
#
|
|
11
|
+
# Example:
|
|
12
|
+
#
|
|
13
|
+
# % irb -r ./bsearch.rb
|
|
14
|
+
# >> %w(a b c c c d e f).bsearch_first {|x| x <=> "c"}
|
|
15
|
+
# => 2
|
|
16
|
+
# >> %w(a b c c c d e f).bsearch_last {|x| x <=> "c"}
|
|
17
|
+
# => 4
|
|
18
|
+
# >> %w(a b c e f).bsearch_first {|x| x <=> "c"}
|
|
19
|
+
# => 2
|
|
20
|
+
# >> %w(a b e f).bsearch_first {|x| x <=> "c"}
|
|
21
|
+
# => nil
|
|
22
|
+
# >> %w(a b e f).bsearch_last {|x| x <=> "c"}
|
|
23
|
+
# => nil
|
|
24
|
+
# >> %w(a b e f).bsearch_lower_boundary {|x| x <=> "c"}
|
|
25
|
+
# => 2
|
|
26
|
+
# >> %w(a b e f).bsearch_upper_boundary {|x| x <=> "c"}
|
|
27
|
+
# => 2
|
|
28
|
+
# >> %w(a b c c c d e f).bsearch_range {|x| x <=> "c"}
|
|
29
|
+
# => 2...5
|
|
30
|
+
# >> %w(a b c d e f).bsearch_range {|x| x <=> "c"}
|
|
31
|
+
# => 2...3
|
|
32
|
+
# >> %w(a b d e f).bsearch_range {|x| x <=> "c"}
|
|
33
|
+
# => 2...2
|
|
34
|
+
|
|
35
|
+
# Namespace that only carries the version of the bundled Ruby/Bsearch library.
module Bsearch
  VERSION = '1.5'
end
|
|
38
|
+
|
|
39
|
+
class Array
  #
  # The binary search algorithm is extracted from Jon Bentley's
  # Programming Pearls 2nd ed. p.93
  #
  # Every method takes an optional index range to search within (default:
  # the whole array) and a block that is yielded one element and must
  # return a negative number, zero or a positive number (like <=>) telling
  # whether that element sorts before, at, or after the wanted position.
  #

  #
  # Return the lower boundary. (inside)
  # This is the index of the first element in the range for which the block
  # is >= 0, or the (exclusive) end of the range if there is none.
  #
  def bsearch_lower_boundary(range = 0...length, &block)
    lower = range.first - 1
    upper = range.exclude_end? ? range.last : range.last + 1
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) < 0
        lower = mid
      else
        upper = mid
      end
    end
    upper
  end

  #
  # This method searches the FIRST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the first occurrence. Return nil if not found.
  #
  def bsearch_first(range = 0...length, &block)
    upper = range.exclude_end? ? range.last : range.last + 1
    boundary = bsearch_lower_boundary(range, &block)
    # boundary == upper means nothing inside the range qualified; the element
    # sitting at that index lies outside the requested range and must not be
    # reported as a hit.
    return nil if boundary >= upper || boundary >= length
    yield(self[boundary]) == 0 ? boundary : nil
  end

  # NOTE: on Ruby >= 2.0 this alias replaces the built-in Array#bsearch,
  # which has different semantics. It is kept because existing callers of
  # this library rely on it.
  alias bsearch bsearch_first

  #
  # Return the upper boundary. (outside)
  # This is one past the index of the last element in the range for which
  # the block is <= 0, or the start of the range if there is none.
  #
  def bsearch_upper_boundary(range = 0...length, &block)
    lower = range.first - 1
    upper = range.exclude_end? ? range.last : range.last + 1
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) <= 0
        lower = mid
      else
        upper = mid
      end
    end
    lower + 1 # outside of the matching range.
  end

  #
  # This method searches the LAST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the last occurrence. Return nil if not found.
  #
  def bsearch_last(range = 0...length, &block)
    # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
    boundary = bsearch_upper_boundary(range, &block) - 1
    # boundary < range.first means nothing inside the range qualified; do
    # not look at the element just before the requested range.
    return nil if boundary <= -1 || boundary < range.first
    yield(self[boundary]) == 0 ? boundary : nil
  end

  #
  # Return the search result as a Range object.
  # The range is empty (lower...lower) when nothing matches.
  #
  def bsearch_range(range = 0...length, &block)
    lower = bsearch_lower_boundary(range, &block)
    upper = bsearch_upper_boundary(range, &block)
    lower...upper
  end
end
|
|
120
|
+
|
data/lib/fasta.rb
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
require 'sample_enzyme'
|
|
2
2
|
require 'each_index'
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'delegate'
|
|
5
|
+
require 'hash_by'
|
|
6
|
+
require 'digest/md5'
|
|
3
7
|
|
|
4
8
|
|
|
5
9
|
tmp = $VERBOSE ; $VERBOSE = nil
|
|
@@ -27,8 +31,10 @@ end
|
|
|
27
31
|
$VERBOSE = tmp
|
|
28
32
|
|
|
29
33
|
|
|
34
|
+
# Forward declaration so the module can be included before its body is defined.
module FastaManipulation
end
|
|
30
35
|
|
|
31
|
-
class Fasta
|
|
36
|
+
class Fasta < DelegateClass(Array)
|
|
37
|
+
include FastaManipulation
|
|
32
38
|
SHUFF_PREFIX = "SHUFF_"
|
|
33
39
|
SHUFF_FILE_POSTFIX = "_SHUFF"
|
|
34
40
|
CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
|
|
@@ -37,21 +43,124 @@ class Fasta
|
|
|
37
43
|
INV_FILE_POSTFIX = "_INV"
|
|
38
44
|
CAT_INV_FILE_POSTFIX = "_CAT_INV"
|
|
39
45
|
|
|
40
|
-
|
|
46
|
+
attr_writer :prots
|
|
47
|
+
# this will probably be relative
|
|
48
|
+
attr_accessor :filename
|
|
41
49
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
50
|
+
# for backwards compatibility
|
|
51
|
+
# Reader for the underlying array of proteins (kept for backwards
# compatibility).
def prots
  return @prots
end
|
|
54
|
+
|
|
55
|
+
# Coerces the argument into a Fasta object: a Fasta is returned untouched,
# anything else is handed to Fasta.new.
def self.to_fasta(file_or_obj)
  return file_or_obj if file_or_obj.is_a?(Fasta)
  Fasta.new(file_or_obj)
end
|
|
62
|
+
|
|
63
|
+
# arg can be:
|
|
64
|
+
# Fasta::Prot objects (Array)
|
|
65
|
+
# filename (String)
|
|
66
|
+
# Another Fasta object (Fasta) (shallow copy!)
|
|
67
|
+
# Builds the object from (depending on the class of arg):
#   Fasta  - shares the other object's prots array and copies its filename
#            (shallow copy!)
#   Array  - used directly as the prots array
#   other  - any other non-nil/false value is passed to read_file
# The resulting @prots array is handed to super.
def initialize(arg=nil, filename=nil)
  @filename = filename
  @prots = []
  if arg.is_a?(Fasta)
    self.prots = arg.prots
    self.filename = arg.filename
  elsif arg.is_a?(Array)
    @prots = arg
  elsif arg
    read_file(arg)
  end
  super(@prots)
end
|
|
82
|
+
|
|
83
|
+
# uses the filename (if available, otherwise returning nil) to grab the md5 sum of the file
|
|
84
|
+
# Returns the hex MD5 digest of the file named by @filename, or nil when no
# filename is set or it does not name an existing regular file.
def md5_sum
  # guard against a nil filename (File.exist?(nil) raises TypeError) and
  # against directories (which cannot be read)
  return nil unless @filename && File.file?(@filename)
  # Digest::MD5.file digests the file without text-mode newline translation
  # and without loading the whole file into one string
  Digest::MD5.file(@filename).hexdigest
end
|
|
49
91
|
|
|
92
|
+
# returns the length of the file (in terms of the total number of amino
|
|
93
|
+
# acids represented)
|
|
94
|
+
# Total number of amino acids: the sum of aaseq.size over every protein
# yielded by each.
def aa_seq_length
  sizes = []
  each { |prot| sizes << prot.aaseq.size }
  sizes.inject(0) { |sum, size| sum + size }
end
|
|
101
|
+
|
|
102
|
+
# searches proteins for a match to the exact sequence and returns a single
|
|
103
|
+
# protein header (with > & no newline)
|
|
104
|
+
# exact matches). nil if no matches
|
|
105
|
+
# Looks up the proteins whose aaseq equals the given sequence and returns an
# Array of their headers (one entry per matching protein), or nil when no
# protein matches.
# NOTE(review): the older comment speaks of "a single protein header", but
# the code has always returned an Array even for one match; that return
# shape is kept here. Confirm against callers before changing it.
def header_from_exact_sequence(aaseq)
  by_sequence = self.hash_by(:aaseq)
  # Array() turns a missing key (nil) into [] so a miss yields nil instead
  # of raising NoMethodError on nil
  headers = Array(by_sequence[aaseq]).map { |prot| prot.header }
  headers.empty? ? nil : headers
end
|
|
116
|
+
|
|
117
|
+
# searches all headers to see if they include input string
|
|
118
|
+
# returns true if one matches, false otherwise
|
|
119
|
+
# (remember that headers are not stored with newline chars but do contain
|
|
120
|
+
# beginning '>'
|
|
121
|
+
# true if at least one protein header contains the input string, false
# otherwise (headers carry the leading '>' but no newline chars).
def included_in_header?(input)
  @prots.each do |prot|
    return true if prot.header.include?(input)
  end
  false
end
|
|
126
|
+
|
|
127
|
+
# takes an io object or string (which is the fasta data) This is not as
|
|
128
|
+
# stringent as 'read_file' which is recommended for industrial type use. For
|
|
129
|
+
# instance, this will fail if your newlines are different in your file from
|
|
130
|
+
# those defined on your operating system. If you have a string, simply pass
|
|
131
|
+
# in StringIO.new(your_string) to be read.
|
|
132
|
+
# returns self
|
|
133
|
+
# Replaces the current proteins with those parsed from io (anything whose
# #each yields lines, e.g. a File or StringIO).
#
# A line starting with '>' begins a new Prot whose header is that line
# (chomped); every following non-blank line is chomped and appended to that
# protein's aaseq. Blank / whitespace-only lines are ignored, as are lines
# that appear before the first header.
#
# returns self
def load(io)
  current_aaseq = nil
  @prots.clear
  io.each do |line|
    if line[0, 1] == '>'
      current_aaseq = ''
      prot = Prot.new
      prot.header = line.chomp
      prot.aaseq = current_aaseq
      @prots << prot
    else
      residues = line.chomp
      # test the chomped text: the old size/regexp test dropped a final
      # one-residue line lacking a newline and let " \n" lines through
      next if residues.strip.empty?
      # sequence text before any header has no protein to belong to
      next unless current_aaseq
      current_aaseq << residues
    end
  end
  self
end
|
|
150
|
+
|
|
151
|
+
# uses 'load' to create a fasta object from a fasta string
|
|
152
|
+
# Builds a new Fasta from fasta-formatted text by wrapping the string in a
# StringIO and passing it to #load; returns whatever #load returns.
def self.from_string(string)
  fasta = Fasta.new
  io = StringIO.new(string)
  fasta.load(io)
end
|
|
155
|
+
|
|
50
156
|
# Reads fasta files (under windows or unix newlines)
|
|
51
157
|
# Always outputs LF separated files
|
|
52
158
|
# Checks that the first character per line is '>' or character class [A-Za-z*]
|
|
53
159
|
# returns a fasta object for stringing commands
|
|
54
|
-
|
|
160
|
+
# if fn not given, will read the :filename attribute
|
|
161
|
+
# will set :filename to fn if given
|
|
162
|
+
def read_file(fn=nil)
|
|
163
|
+
@filename = fn if fn
|
|
55
164
|
first_char_re = /[A-Za-z*]/o
|
|
56
165
|
obj = nil
|
|
57
166
|
regex = /(\r\n)|\n/o
|
|
@@ -76,14 +185,151 @@ class Fasta
|
|
|
76
185
|
self
|
|
77
186
|
end
|
|
78
187
|
|
|
79
|
-
#
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
188
|
+
# if no fn, will write to :filename attribute
|
|
189
|
+
# Writes every protein (prot.to_s) to the file fn, opened in binary mode.
# If no fn is given the :filename attribute is used.
# Raises ArgumentError when there is no filename to write to.
def write_file(fn=nil)
  # NOTE(review): @out was the old fallback but is never assigned in the
  # visible code; it is still honoured first in case something else sets it.
  fn ||= @out || @filename
  raise ArgumentError, "no filename given and no :filename attribute set" unless fn
  File.open(fn, "wb") do |out|
    @prots.each { |prot| out.print(prot.to_s) }
  end
end
|
|
197
|
+
|
|
198
|
+
# duplicates the object (deep copy)
|
|
199
|
+
# Deep copy: a new object of the same class with the same filename whose
# prots are dups of this object's prots.
def dup
  copy = self.class.new
  copy.filename = filename
  prots.each { |prot| copy.prots << prot.dup }
  copy
end
|
|
207
|
+
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
# Produces reversed / shuffled variants of a fasta file (optionally
# concatenated to a copy of the original) and writes them to disk.
class FastaShaker

  # the only methods that may be invoked by name from the command line
  METHODS = %w(reverse shuffle)

  # Reverses the proteins of the fasta (file name or Fasta object) and writes
  # the result. See shake_it for the options.
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Shuffles the proteins of the fasta (file name or Fasta object) and writes
  # the result. See shake_it for the options.
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Returns a descriptive output filename built from the fasta's filename
  # (or 'fasta' if it has none), the method and the options used:
  #   <base>[_cat]_<method>[_prefix_<p>][_fraction_<f>][_tryptic_peptides].fasta
  # <base> is the filename with everything from the first '.' of its
  # basename removed; the directory part is kept as is.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    base = File.basename(file)
    if file[-base.size, base.size] == base
      # only strip inside the basename so that dots in the directory part
      # (e.g. "./dir/x.fasta") do not wipe out the whole path
      filebase = file[0, file.size - base.size] + base.sub(/\..*$/, '')
    else
      filebase = file.sub(/\..*$/, '')
    end
    parts = [filebase]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  protected

  # Does the work for reverse/shuffle.
  # method: :reverse | :shuffle
  # opt keys read here:
  #   :cat              append the modified proteins to a copy of the original
  #   :out              output filename (set via create_filename if absent;
  #                     note that this writes into the caller's hash)
  #   :fraction         passed to fasta.fraction_of_prots
  #   :prefix           passed to fasta.header_prefix!
  #   :tryptic_peptides passed to fasta.aaseq!
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      message = "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
      warn message
    end

    unless opt[:out]
      opt[:out] = create_filename(fasta, method, opt)
    end

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    if f = opt[:fraction]
      prefix = nil
      if f > 1.0
        prefix = proc {|cnt| "f#{cnt}_" }
      end
      fasta = fasta.fraction_of_prots(f, prefix)
    end

    ## PREFIX the proteins
    if pre = opt[:prefix]
      fasta.header_prefix!(pre)
    end

    ## MODIFY the proteins
    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end

  #############################################
  # END MAIN METHODS
  #############################################

  # takes command line input, and sends it to shake
  # argv is expected as: <method> [OPTIONS] <file>.fasta
  # Prints the usage and exits if fewer than two non-option arguments are
  # left; prints an error plus the usage and exits with status 1 if <method>
  # is not one of METHODS.
  def FastaShaker.shake_from_argv(argv)
    opt = {}

    opts = OptionParser.new do |op|
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator " <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner). Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator " (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }

      op.separator ""
      op.separator "EXAMPLES: "
      op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
    end

    #p argv
    opts.parse!(argv)

    if argv.size < 2
      puts opts
      exit
    end

    (method, file) = argv
    # never send an arbitrary user-supplied name: that would let the command
    # line reach any method of the object (including Kernel methods)
    unless METHODS.include?(method)
      warn "unknown method: #{method.inspect} (expected reverse | shuffle)"
      puts opts
      exit 1
    end
    fs = FastaShaker.new
    fs.send(method.to_sym, file, opt)
  end

end
|
|
330
|
+
|
|
331
|
+
module FastaManipulation
|
|
332
|
+
|
|
87
333
|
# concatenates the filenames like this:
|
|
88
334
|
# cat_filenames('fn1.ext1', 'fn2.ext2', '__') # -> 'fn1__fn2.ext1'
|
|
89
335
|
# the path and extension of the first filename are kept intact.
|
|
@@ -99,18 +345,6 @@ class Fasta
|
|
|
99
345
|
fn1.gsub(/#{Regexp.escape(fn1_ext)}$/, connector + con_filenames + fn1_ext)
|
|
100
346
|
end
|
|
101
347
|
|
|
102
|
-
# Convenience method for creating a modified file with a particular method
|
|
103
|
-
# from Fasta. Returns the name of the output file.
|
|
104
|
-
def self.modify_file(file, method, file_postfix="", prot_header_prefix=nil)
|
|
105
|
-
file_out = prefix_extension(file, file_postfix)
|
|
106
|
-
fasta = Fasta.new
|
|
107
|
-
fasta.read_file(file)
|
|
108
|
-
fasta.send(method)
|
|
109
|
-
fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
|
|
110
|
-
fasta.write_file(file_out)
|
|
111
|
-
file_out
|
|
112
|
-
end
|
|
113
|
-
|
|
114
348
|
# returns a new fasta object using some fraction of proteins randomly
|
|
115
349
|
# selected (fraction may be > 1). Always rounds up. Will not choose a
|
|
116
350
|
# protein twice unless all other proteins have been chosen
|
|
@@ -150,30 +384,6 @@ class Fasta
|
|
|
150
384
|
fasta_fraction = Fasta.new(arr)
|
|
151
385
|
end
|
|
152
386
|
|
|
153
|
-
# Convenience method for modifying some fraction of the proteins of a file
|
|
154
|
-
# and concatenating it to a copy of the original. Returns the name of the
|
|
155
|
-
# output file.
|
|
156
|
-
def self.modify_fraction_and_cat_to_file(file, method, fraction=1, file_postfix=nil, prot_header_prefix=nil)
|
|
157
|
-
#puts [file, method, fraction, file_postfix, prot_header_prefix].join("*")
|
|
158
|
-
file_postfix = "" unless file_postfix
|
|
159
|
-
fasta = Fasta.new
|
|
160
|
-
fasta.read_file(file)
|
|
161
|
-
outfile = prefix_extension(file, file_postfix)
|
|
162
|
-
other_fasta = fasta.fraction_of_prots(fraction)
|
|
163
|
-
other_fasta.send(method)
|
|
164
|
-
other_fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
|
|
165
|
-
fasta << other_fasta
|
|
166
|
-
fasta.write_file(outfile)
|
|
167
|
-
return outfile
|
|
168
|
-
end
|
|
169
|
-
|
|
170
|
-
# Convenience method for modifying a file and concatenating it to a copy of
|
|
171
|
-
# the original. Returns th name of the output file.
|
|
172
|
-
def self.modify_and_cat_to_file(file, method, file_postfix=nil, prot_header_prefix=nil)
|
|
173
|
-
fraction = 1
|
|
174
|
-
modify_fraction_and_cat_to_file(file, method, fraction, file_postfix, prot_header_prefix)
|
|
175
|
-
end
|
|
176
|
-
|
|
177
387
|
# Convenience method to concatenate an array of fasta files. Filenames are
|
|
178
388
|
# concatenated according to 'cat_filenames') and prefixes the proteins
|
|
179
389
|
# according to the values in 'file_prot_header_prefixes' array
|
|
@@ -196,35 +406,13 @@ class Fasta
|
|
|
196
406
|
end
|
|
197
407
|
|
|
198
408
|
# Appends to the proteins of this object:
#   Fasta       - all of the other object's prots are appended
#   Fasta::Prot - the single protein is appended
# Any other kind of object is ignored.
# Returns self so that calls can be chained (fasta << a << b).
def <<(other)
  # case when with class names uses === operator
  case other
  when Fasta
    # concat instead of push(*array): no argument splat for large files
    @prots.concat(other.prots)
  when Fasta::Prot
    @prots.push(other)
  end
  self
end
|
|
229
417
|
|
|
230
418
|
# method = :shuffle! | :reverse!
|
|
@@ -260,10 +448,39 @@ class Fasta
|
|
|
260
448
|
prot.header_prefix!(prefix)
|
|
261
449
|
end
|
|
262
450
|
end
|
|
263
|
-
|
|
451
|
+
|
|
452
|
+
end
|
|
453
|
+
|
|
454
|
+
# requires that object respond_to? :reference
|
|
455
|
+
# Mixin for objects that respond to :reference (a String or nil).
module ProteinReferenceable
  # Returns the reference, with leading whitespace removed, up to (not
  # including) its first space; the whole stripped reference if it contains
  # no space. Returns nil if reference is nil (or false) and '' for an empty
  # reference.
  def first_entry
    ref = reference
    return nil unless ref
    # A one-character reference used to be discarded by a `size > 1' test
    # (which only makes sense for a header that starts with '>').
    ref.lstrip[/\A[^ ]*/]
  end

end
|
|
265
477
|
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
|
|
266
481
|
class Fasta::Prot
|
|
482
|
+
include ProteinReferenceable
|
|
483
|
+
|
|
267
484
|
# header given as full line with starting '>' (but no newline chars!).
|
|
268
485
|
# aaseq also given without any newline chars
|
|
269
486
|
attr_accessor :header, :aaseq
|
|
@@ -280,15 +497,30 @@ class Fasta::Prot
|
|
|
280
497
|
other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
|
|
281
498
|
end
|
|
282
499
|
|
|
283
|
-
#
|
|
284
|
-
def
|
|
285
|
-
|
|
286
|
-
|
|
500
|
+
# gives the string up to the first space (without the leading '>')
|
|
501
|
+
# The header text between the first character (the leading '>') and the
# first space, or everything after the first character when there is no
# space. nil when there is no header, '' when the header is at most one
# character long.
def first_entry
  return nil unless @header
  return '' unless @header.size > 1
  stop = @header.index(' ')
  stop ? @header[1...stop] : @header[1..-1]
end
|
|
291
518
|
|
|
519
|
+
# returns the fasta header information without the leading '>'
|
|
520
|
+
# returns the fasta header information without the leading '>'
# (everything after the first character of @header), or nil when no header
# has been set, matching first_entry's handling of a missing header.
def reference
  @header && @header[1..-1]
end
|
|
523
|
+
|
|
292
524
|
# returns the value after the first '|' and before the second '|'
|
|
293
525
|
# according to this regexp: /\|(.*?)\|/
|
|
294
526
|
# This will typically be the gi code
|
|
@@ -314,7 +546,7 @@ class Fasta::Prot
|
|
|
314
546
|
def tryptic_peptides!(method_as_symbol)
|
|
315
547
|
peps = SampleEnzyme.tryptic(@aaseq)
|
|
316
548
|
ends_in_RK = /[KR]/o
|
|
317
|
-
|
|
549
|
+
|
|
318
550
|
## if the last peptide doesn't end in R or K we want to flip it completely
|
|
319
551
|
last_pep_special = nil
|
|
320
552
|
if peps.last[-1,1] !~ /[KR]/
|
|
@@ -360,7 +592,7 @@ class Fasta::Prot
|
|
|
360
592
|
|
|
361
593
|
end
|
|
362
594
|
|
|
363
|
-
|
|
595
|
+
|
|
364
596
|
# For reference, my code is about 15X faster than the first code I wrote
|
|
365
597
|
# below! It turns out that the major slowdown is in the randomize routine.
|
|
366
598
|
# Using my own randomize routine with the below way of reading fasta
|
|
@@ -391,4 +623,4 @@ end
|
|
|
391
623
|
# end
|
|
392
624
|
# end
|
|
393
625
|
#end
|
|
394
|
-
|
|
626
|
+
by=:protein, num=1
|
data/lib/group_by.rb
ADDED
data/lib/index_by.rb
ADDED
data/lib/merge_deep.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
class Hash

  # Returns a new hash containing self merged with hash2 (values from hash2
  # win), except that where both hashes hold a Hash under the same key those
  # nested hashes are merged as well, down to the level specified:
  #   level 1 - hashes directly inside the hash are merged (with Hash#merge)
  #   level 2 - hashes inside those hashes are merged too, and so on
  # Neither self nor hash2 is modified.
  # Raises NotImplementedError if level is not a number >= 1.
  def merge_deep(hash2, level=1)
    unless level.is_a?(Numeric) && level >= 1
      raise NotImplementedError, "level must be >= 1 (got #{level.inspect})"
    end
    # the block is only called for keys present in both hashes
    merge(hash2) do |_key, mine, theirs|
      if mine.is_a?(Hash) && theirs.is_a?(Hash)
        level > 1 ? mine.merge_deep(theirs, level - 1) : mine.merge(theirs)
      else
        theirs
      end
    end
  end
end
|
|
21
|
+
|