mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/fasta.rb
DELETED
|
@@ -1,626 +0,0 @@
|
|
|
1
|
-
require 'sample_enzyme'
|
|
2
|
-
require 'each_index'
|
|
3
|
-
require 'optparse'
|
|
4
|
-
require 'delegate'
|
|
5
|
-
require 'hash_by'
|
|
6
|
-
require 'digest/md5'
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
# Core extension: gives String index iteration plus destructive and
# non-destructive shuffling.  Warnings are switched off while String is
# reopened; the previous verbosity level is put back afterwards.
saved_verbosity = $VERBOSE
$VERBOSE = nil
class String

  # Yields each character index of the string in order (0, 1, ... size-1).
  def each_index
    (0...size).each { |position| yield position }
  end

  # Shuffles the characters of the receiver in place: walking left to right,
  # every position is swapped with a randomly chosen position at or after it.
  # modifies and returns self
  def shuffle!
    each_index do |left|
      right = left + rand(size - left)
      self[left], self[right] = self[right], self[left]
    end
    self
  end

  # Returns a shuffled copy of the string; the receiver is not modified.
  def shuffle
    dup.shuffle!
  end

end
$VERBOSE = saved_verbosity
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
require 'stringio' # Fasta.from_string wraps its argument in a StringIO

# forward declaration so that Fasta can include the module before the module's
# methods are defined
module FastaManipulation ; end

# A list of protein entries (objects responding to :header and :aaseq, built
# as Prot instances when parsing).  The object delegates to the underlying
# Array of proteins, so Array methods can be called on it directly.
class Fasta < DelegateClass(Array)
  include FastaManipulation
  SHUFF_PREFIX = "SHUFF_"
  SHUFF_FILE_POSTFIX = "_SHUFF"
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
  FILE_CONNECTOR = "__"
  INV_PREFIX = "INV_"
  INV_FILE_POSTFIX = "_INV"
  CAT_INV_FILE_POSTFIX = "_CAT_INV"

  # this will probably be relative
  attr_accessor :filename

  # for backwards compatibility
  attr_reader :prots

  # Replaces the protein list.  The Array delegate is re-pointed as well, so
  # delegated calls (each, size, [] ...) see the new list rather than the old.
  def prots=(new_prots)
    @prots = new_prots
    __setobj__(new_prots)
  end

  # Returns the argument itself when it already is a Fasta object, otherwise
  # builds a new Fasta from it (see initialize for the accepted arguments).
  def self.to_fasta(file_or_obj)
    file_or_obj.is_a?(Fasta) ? file_or_obj : Fasta.new(file_or_obj)
  end

  # arg can be:
  #   Fasta::Prot objects (Array)
  #   filename (String)
  #   Another Fasta object (Fasta) (shallow copy! the protein list is shared
  #   and the other object's filename is taken over)
  def initialize(arg=nil, filename=nil)
    @filename = filename
    @prots = []
    if arg.is_a? Fasta
      @prots = arg.prots
      @filename = arg.filename
    elsif arg.is_a? Array
      @prots = arg
    elsif arg
      read_file(arg)
    end
    super(@prots)
  end

  # uses the filename to grab the md5 sum of the file.  Returns nil when no
  # filename is set or the file does not exist.
  def md5_sum
    return nil unless @filename && File.exist?(@filename)
    # read in binary mode so the digest is over the bytes actually on disk
    Digest::MD5.hexdigest(File.open(@filename, 'rb') { |fh| fh.read })
  end

  # returns the length of the file (in terms of the total number of amino
  # acids represented)
  def aa_seq_length
    tot = 0
    self.each { |prot| tot += prot.aaseq.size }
    tot
  end

  # searches proteins for an exact match to the given sequence.
  # Returns an Array holding the header (with > & no newline) of every
  # matching protein, or nil if there are no matches.
  def header_from_exact_sequence(aaseq)
    headers = @prots.select { |prot| prot.aaseq == aaseq }.map { |prot| prot.header }
    headers.empty? ? nil : headers
  end

  # searches all headers to see if they include input string
  # returns true if one matches, false otherwise
  # (remember that headers are not stored with newline chars but do contain
  # beginning '>')
  def included_in_header?(input)
    @prots.any? { |prot| prot.header.include?(input) }
  end

  # takes an io object or string (which is the fasta data).  This is not as
  # stringent as 'read_file', which is recommended for industrial type use
  # (no check is made on the characters of sequence lines).
  # Any proteins already held are discarded first.  Blank lines are skipped.
  # Raises a RuntimeError if sequence data appears before the first header.
  # returns self
  def load(io)
    current_aaseq = nil
    @prots.clear
    # IO, StringIO and String all offer each_line; fall back to each otherwise
    iterator = io.respond_to?(:each_line) ? :each_line : :each
    io.send(iterator) do |line|
      content = line.chomp
      if content[0,1] == '>'
        current_aaseq = ''
        current_prot = Prot.new
        current_prot.header = content
        current_prot.aaseq = current_aaseq
        @prots << current_prot
      elsif !content.strip.empty?
        unless current_aaseq
          raise "Line not in fasta format (between arrows): -->#{content}<--"
        end
        current_aaseq << content
      end
    end
    self
  end

  # uses 'load' to create a fasta object from a fasta string
  def self.from_string(string)
    Fasta.new.load(StringIO.new(string))
  end

  # Reads fasta files (under windows or unix newlines)
  # Checks that the first character per line is '>' or character class [A-Za-z*]
  # and raises a RuntimeError otherwise (also when sequence data precedes the
  # first header).  Proteins read are appended to those already held.
  # returns a fasta object for stringing commands
  # if fn not given, will read the :filename attribute
  # will set :filename to fn if given
  def read_file(fn=nil)
    @filename = fn if fn
    # block form closes the handle even when reading fails
    contents = File.open(@filename, 'rb') { |fh| fh.read }
    current = nil
    contents.split(/\r?\n/).each do |line|
      next unless line =~ /[^ \n\r]/
      first_char = line[0,1]
      if first_char == '>'
        current = Prot.new
        @prots << current
        current.header = line.dup
      elsif current && first_char =~ /[A-Za-z*]/
        current.aaseq << line.chomp
      else
        raise "Line not in fasta format (between arrows): -->#{line}<--"
      end
    end
    self
  end

  # Writes every protein (via its to_s) to the file, in binary mode.
  # if no fn, will write to :filename attribute
  def write_file(fn=nil)
    fn ||= @filename
    File.open(fn, "wb") do |out|
      @prots.each { |prot| out.print(prot.to_s) }
    end
  end

  # duplicates the object (deep copy: every protein is dup'ed as well)
  def dup
    other = self.class.new
    other.filename = self.filename
    self.prots.each { |prot| other.prots << prot.dup }
    other
  end

end
|
|
209
|
-
|
|
210
|
-
# Creates modified (reversed or shuffled) versions of fasta files, optionally
# concatenated to the original proteins, and writes them out.
class FastaShaker

  # the transformations that may be requested by name from the command line
  METHODS = %w[reverse shuffle]

  # Reverses the protein sequences of the given fasta file or Fasta object
  # and writes the result (see shake_it for the options).
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Shuffles the protein sequences of the given fasta file or Fasta object
  # and writes the result (see shake_it for the options).
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Builds a descriptive output filename from the fasta object's filename
  # ('fasta' when it has none), the method name and the options in effect.
  # Everything from the first dot of the last path component onward is
  # dropped, so dots in directory names (e.g. './dir/x.fasta') are preserved.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    filebase = file.sub(/\.[^\/\\]*\z/, '')
    parts = [filebase]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  protected

  # Does the work for reverse/shuffle.  method is :reverse or :shuffle.
  # Options read from opt:
  #   :cat              append the modified proteins to a copy of the originals
  #   :prefix           header prefix given to the modified proteins
  #   :fraction         fraction of the proteins to use (may be > 1)
  #   :tryptic_peptides apply the method to tryptic peptides, not whole proteins
  #   :out              output filename (filled in via create_filename if absent)
  # Without :cat the proteins of a Fasta object passed in are modified in place.
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      warn "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
    end

    opt[:out] ||= create_filename(fasta, method, opt)

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    if f = opt[:fraction]
      # more than one pass through the proteins: tag each pass so headers stay unique
      prefix = (f > 1.0) ? proc {|cnt| "f#{cnt}_" } : nil
      fasta = fasta.fraction_of_prots(f, prefix)
    end

    ## PREFIX the proteins
    if pre = opt[:prefix]
      fasta.header_prefix!(pre)
    end

    ## MODIFY the proteins
    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end

  #############################################
  # END MAIN METHODS
  #############################################

  # takes command line input, and sends it to shake.
  # Prints the usage and exits when fewer than two arguments remain after
  # option parsing, or (with exit status 1) when the method is not one of
  # METHODS -- the name comes from the user and must not be sent blindly.
  def FastaShaker.shake_from_argv(argv)
    opt = {}

    opts = OptionParser.new do |op|
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator " <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner). Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator " (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }

      op.separator ""
      op.separator "EXAMPLES: "
      op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator " #{prog} reverse file.fasta --tryptic_peptides -o tryptic_peptides_reversed.fasta"
    end

    opts.parse!(argv)

    if argv.size < 2
      puts opts
      exit
    end

    (method, file) = argv
    unless METHODS.include?(method)
      warn "unknown method '#{method}' (expected one of: #{METHODS.join(' | ')})"
      puts opts
      exit 1
    end
    FastaShaker.new.send(method.to_sym, file, opt)
  end

end
|
|
330
|
-
|
|
331
|
-
# Operations on a list of proteins.  The instance methods expect the including
# object to keep its proteins in @prots (and to expose them through #prots).
module FastaManipulation

  # concatenates the filenames like this:
  #   cat_filenames(['fn1.ext1', 'fn2.ext2'], '__') # -> 'fn1__fn2.ext1'
  # the path and extension of the first filename are kept intact.
  # other files only use the basename (with no extension)
  # The filenames array is not modified; a nil connector is treated as "".
  def self.cat_filenames(filenames, connector="")
    connector = connector.to_s
    first, *others = filenames
    ext = File.extname(first)
    stem = ext.empty? ? first : first[0...-ext.size]
    other_bases = others.map {|fn| File.basename(fn, File.extname(fn)) }
    stem + connector + other_bases.join(connector) + ext
  end

  # returns a new fasta object using some fraction of proteins randomly
  # selected (fraction may be > 1). Always rounds up. Will not choose a
  # protein twice unless all other proteins have been chosen
  #
  # prefix_proc ensures that a unique header is given even if multiple
  # fractions of proteins are being created.  It is called with the number of
  # complete passes already made through the protein list (0 for the first n
  # proteins chosen, 1 for the next n, etc.) and its result is prefixed to the
  # header of the (duplicated) protein chosen.
  # e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
  # would give headers like this: >f0_<some_real_header>,
  # >f1_<some_real_header>, ...
  def fraction_of_prots(fraction=1, prefix_proc=nil)
    wanted = (fraction.to_f * @prots.size).ceil
    chosen = []
    pool = []
    pass = -1
    wanted.times do
      if pool.empty?
        # start a new pass with fresh copies of every protein
        pool = @prots.map {|prt| prt.dup }
        pass += 1
      end
      picked = pool.delete_at(rand(pool.size))
      picked.header_prefix!(prefix_proc.call(pass)) if prefix_proc
      chosen << picked
    end
    Fasta.new(chosen)
  end

  # Convenience method to concatenate an array of fasta files. Filenames are
  # concatenated according to 'cat_filenames' and the proteins of each file
  # are prefixed according to the values in the 'file_prot_header_prefixes'
  # array (a nil entry leaves that file's headers alone).
  # Writes the combined file and returns its name.
  def self.cat_and_prefix(files, file_prot_header_prefixes=nil, file_connector=nil)
    fastas = files.map {|file| Fasta.new.read_file(file) }
    outfile = cat_filenames(files, file_connector)
    if file_prot_header_prefixes
      file_prot_header_prefixes.each_with_index do |prefix,i|
        fastas[i].header_prefix!(prefix) if prefix
      end
    end
    fasta1 = fastas.shift
    fastas.each {|fasta| fasta1 << fasta }
    fasta1.write_file(outfile)
    outfile
  end

  # Appends all proteins of another Fasta object, or a single Fasta::Prot.
  # Objects of any other class are ignored.  Returns self so that calls can
  # be chained (a << b << c).
  def <<(other)
    # case when with class names uses === operator
    case other
    when Fasta
      @prots.push(*(other.prots))
    when Fasta::Prot
      @prots.push(other)
    end
    self
  end

  # Applies the method to the sequence of each protein, or to each of its
  # tryptic peptides when tryptic_peptides is true.
  # method = :shuffle! | :reverse!
  def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
    if tryptic_peptides
      @prots.each {|prot| prot.tryptic_peptides!(method_as_symbol) }
    else
      @prots.each {|prot| prot.aaseq!(method_as_symbol) }
    end
  end

  # shuffles the aa sequence of each protein (each protein within itself)
  def aaseq_shuffle!
    @prots.each {|prot| prot.shuffle! }
  end

  # inverts the aa sequence of each protein (each protein within itself)
  def aaseq_invert!
    @prots.each {|prot| prot.invert! }
  end

  # inverts the tryptic peptides of each protein
  def aaseq_invert_tryptic_peptides!
    @prots.each {|prot| prot.invert_tryptic_peptides! }
  end

  # shuffles the tryptic peptides of each protein
  def aaseq_shuffle_tryptic_peptides!
    @prots.each {|prot| prot.shuffle_tryptic_peptides! }
  end

  # prefixes the header of every protein (see the protein's header_prefix!)
  def header_prefix!(prefix)
    @prots.each {|prot| prot.header_prefix!(prefix) }
  end

end
|
|
453
|
-
|
|
454
|
-
# requires that object respond_to? :reference
module ProteinReferenceable
  # gives the reference string up to (not including) its first space, after
  # leading whitespace has been stripped.
  # Returns nil when reference is nil and '' when the reference is empty or
  # contains only whitespace.  A one-character reference is returned as is.
  def first_entry
    ref = reference
    return nil unless ref
    trimmed = ref.lstrip
    cut = trimmed.index(' ')
    cut ? trimmed[0...cut] : trimmed.dup
  end

end
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
# A single fasta entry: a header line plus its amino acid sequence.
class Fasta::Prot
  include ProteinReferenceable

  # header given as full line with starting '>' (but no newline chars!).
  # aaseq also given without any newline chars
  attr_accessor :header, :aaseq

  # Both arguments default to an empty string.
  def initialize(header=nil, aaseq=nil)
    @header = header || ''
    @aaseq = aaseq || ""
  end

  # Two proteins are equal when they are of the same class and both their
  # sequence and header are equal.
  def ==(other)
    other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
  end

  # gives the string up to the first space (without the leading '>')
  # Returns nil if the header is nil and '' if it has fewer than two chars.
  def first_entry
    return nil unless @header
    return '' unless @header.size > 1
    index = @header.index(' ')
    index ? @header[1...index] : @header[1..-1]
  end

  # returns the fasta header information without the leading '>'
  def reference
    @header[1..-1]
  end

  # returns the value after the first '|' and before the second '|'
  # according to this regexp: /\|(.*?)\|/
  # This will typically be the gi code
  # Returns nil if it doesn't match
  def gi
    match = /\|(.*?)\|/.match(@header)
    match ? match[1].dup : nil
  end

  # convenience
  def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
  def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end

  # modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
  # [cuts after K or R but not if followed by a P]
  # method_as_symbol may be :reverse | :shuffle OR :reverse! | :shuffle!
  # The final K/R of each peptide stays in place; only the residues before it
  # are rearranged.  A last peptide that does not end in K or R is rearranged
  # completely.  With :reverse, and assuming the cleavage rule above --
  # NOTE(review): confirm against SampleEnzyme.tryptic --
  #   aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
  #        -> 'CBAKEDCREDKEGDKYXWRRKEDR'
  # The sequence is left unchanged when no peptides are returned.
  def tryptic_peptides!(method_as_symbol)
    peps = SampleEnzyme.tryptic(@aaseq)
    return @aaseq if peps.empty?

    ## if the last peptide doesn't end in R or K we want to flip it completely
    last_pep_special = nil
    if peps.last[-1,1] !~ /[KR]/
      last_pep_special = peps.pop
    end
    # pep[-1,1] is a one-character String on every ruby version
    rev_peps = peps.map {|pep| pep[0..-2].send(method_as_symbol) << pep[-1,1] }
    if last_pep_special
      rev_peps << last_pep_special.send(method_as_symbol)
    end
    @aaseq = rev_peps.join
  end

  # applies the given destructive String method to the sequence
  # takes :reverse! | :shuffle!
  def aaseq!(method_as_symbol)
    @aaseq.send(method_as_symbol)
  end

  # reverses the sequence in place
  def invert!
    @aaseq.reverse!
  end

  # shuffles the sequence in place
  def shuffle!
    @aaseq.shuffle!
  end

  # adds a prefix to the protein header (which comes after the '>' char) if
  # one is not already there.
  # Returns the modified header, or nil when nothing was changed.
  def header_prefix!(prefix)
    return nil if @header =~ /\A>#{Regexp.escape(prefix)}/
    # block form: backslashes in the prefix are inserted literally instead of
    # being read as replacement escapes
    @header.sub!(/\A>/) { ">#{prefix}" }
  end

  # copies header and sequence as well, so the duplicate can be modified freely
  def dup
    self.class.new(@header.dup, @aaseq.dup)
  end

  # returns the header line and aaseq with trailing newlines as one might find
  # in a fasta file
  def to_s
    @header + "\n" + @aaseq + "\n"
  end

end
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
# For reference, my code is about 15X faster than the first code I wrote
|
|
597
|
-
# below! It turns out that the major slowdown is in the randomize routine.
|
|
598
|
-
# Using my own randomize routine with the below way of reading fasta
|
|
599
|
-
# files is 2X faster than below (in other words, my reader is 2X as fasta).
|
|
600
|
-
#
|
|
601
|
-
##!/usr/bin/ruby -w
|
|
602
|
-
#
|
|
603
|
-
#require 'bio'
|
|
604
|
-
#
|
|
605
|
-
#SHUFF_EXT = "_shuffled"
|
|
606
|
-
#
|
|
607
|
-
#if ARGV.size < 1
|
|
608
|
-
# puts <<END
|
|
609
|
-
#usage: #{File.basename(__FILE__)} file.fasta ... # -> file#{SHUFF_EXT}.fasta ...
|
|
610
|
-
#Shuffles the amino acid sequence of each protein.
|
|
611
|
-
#END
|
|
612
|
-
# exit
|
|
613
|
-
#end
|
|
614
|
-
#
|
|
615
|
-
#ARGV.each do |fn|
|
|
616
|
-
# fn_ext = File.extname(fn)
|
|
617
|
-
# fn_out = fn.gsub(fn_ext, SHUFF_EXT + fn_ext)
|
|
618
|
-
# File.open(fn_out, "w") do |fh|
|
|
619
|
-
# f = Bio::FlatFile.auto(fn)
|
|
620
|
-
# f.each_entry do |e|
|
|
621
|
-
# fh.puts '>' + e.definition
|
|
622
|
-
# fh.puts e.aaseq.randomize
|
|
623
|
-
# end
|
|
624
|
-
# end
|
|
625
|
-
#end
|
|
626
|
-
by=:protein, num=1
|