mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id_xml.rb
DELETED
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# I would prefer to call this SpecID::XML, but I keep getting an error:
|
|
3
|
-
# /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
|
|
4
|
-
# constant XML referenced by SpecID::XML' This works around that for now.
|
|
5
|
-
# Any major xml elements should return a newline at the end for simple
|
|
6
|
-
# concatenation into a file
|
|
7
|
-
# Mixin with small helpers for emitting indented XML fragments.
#
# Every "major" element helper returns a string terminated by a newline so
# that fragments can simply be concatenated into a file.  Indentation is
# driven by the global $DEPTH counter, which the nesting helpers increment
# while their block runs.
#
# NOTE: attribute values are interpolated verbatim; callers that need
# escaping must run values through #escape_special_chars themselves.
module SpecIDXML

  # Maps each character that is special in XML to its predefined entity.
  MSial_chrs_hash = {
    '"' => '&quot;',
    '&' => '&amp;',
    "'" => '&apos;',
    '<' => '&lt;',
    '>' => '&gt;',
  }.freeze

  # Returns a copy of +string+ with the five XML special characters replaced
  # by their entities.  All other characters are passed through untouched.
  def escape_special_chars(string)
    string.gsub(/["&'<>]/, MSial_chrs_hash)
  end

  # Current nesting depth (shared, process-wide state).
  $DEPTH = 0

  # Returns one tab character per level of the current nesting depth.
  def tabs
    "\t" * $DEPTH
  end

  # Returns a single <parameter .../> element (no trailing newline) whose
  # name is +symbol+ and whose value is the result of obj.send(symbol).
  def param_xml(obj, symbol)
    %(#{tabs}<parameter name="#{symbol}" value="#{obj.send(symbol)}"/>)
  end

  # Returns one <parameter .../> line per symbol, each newline terminated.
  def params_xml(obj, *symbol_list)
    symbol_list.map { |sy| param_xml(obj, sy) }.join("\n") + "\n"
  end

  # Returns a self-closing element whose attributes are read from self by
  # sending each symbol in +att_list+.
  def short_element_xml(element, att_list)
    "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
  end

  # Returns a self-closing element using a pre-built attribute string.
  def short_element_xml_and_att_string(element, att_string)
    "#{tabs}<#{element} #{att_string}/>\n"
  end

  # Returns a self-closing element whose attributes are all of the
  # receiver's *instance variables* (no methods are called).
  def short_element_xml_from_instance_vars(element_name)
    # to_s keeps this working whether instance_variables yields strings or symbols
    string = instance_variables.map { |v| "#{v.to_s[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
    "#{tabs}<#{element_name} #{string}/>\n"
  end

  # Returns an attribute-less element wrapping whatever string the optional
  # block returns; the block runs one indentation level deeper.
  def element_xml_no_atts(element, &block)
    nested_element_xml("#{tabs}<#{element}>\n", element, &block)
  end

  # Like #element_xml_no_atts, but with attributes read from self by sending
  # each symbol in +att_list+.
  def element_xml(element, att_list, &block)
    nested_element_xml("#{tabs}<#{element} #{attrs_xml(att_list)}>\n", element, &block)
  end

  # Like #element_xml_no_atts, but with a pre-built attribute string.
  def element_xml_and_att_string(element, att_string, &block)
    nested_element_xml("#{tabs}<#{element} #{att_string}>\n", element, &block)
  end

  # Returns symbol="value" where value is the result of send(symbol).
  def attr_xml(symbol)
    "#{symbol}=\"#{send(symbol)}\""
  end

  # Returns the space separated attribute string for a list of symbols.
  def attrs_xml(list_of_symbols)
    list_of_symbols.map { |sy| attr_xml(sy) }.join(" ")
  end

  private

  # Shared body of the element_xml* helpers: runs the optional block one
  # level deeper and appends the closing tag at the original depth.
  def nested_element_xml(start, element)
    $DEPTH += 1
    middle =
      begin
        block_given? ? yield : ''
      ensure
        # always restore the depth, even when the block raises
        $DEPTH -= 1
      end
    start + middle + "#{tabs}</#{element}>\n"
  end

end
|
|
99
|
-
|
data/lib/transmem/phobius.rb
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
require 'transmem'
|
|
2
|
-
|
|
3
|
-
# Forward declaration: creates the Phobius namespace so that Phobius::Index can
# be defined before the class body is filled in further down this file.
class Phobius ; end
|
|
4
|
-
|
|
5
|
-
# This class will probably change its interface some in the future
|
|
6
|
-
# That's the web portal
|
|
7
|
-
# http://phobius.cgb.ki.se/
|
|
8
|
-
# How to run:
|
|
9
|
-
# Select output format as 'Short'
|
|
10
|
-
# then hit 'Submit Query'
|
|
11
|
-
|
|
12
|
-
# note: to implement some of the TransmemIndex features, the update_aaseq
|
|
13
|
-
# method must be called!
|
|
14
|
-
# Hash of Phobius predictions keyed by protein identifier.
#
# The hash is populated from a Phobius output file at construction time.
# To use the TransmemIndex features that need sequences, #update_aaseq must
# have been called (it is called automatically when a fasta object is given).
class Phobius::Index < Hash
  include TransmemIndex

  # file::      path to a Phobius output file
  # fasta_obj:: optional fasta object; when given, #update_aaseq is run
  def initialize(file, fasta_obj = nil)
    Phobius.default_index(file, self)
    update_aaseq(fasta_obj) if fasta_obj
  end

  # Converts a fasta reference line into the key used in this index: the
  # text up to the first space, with any double quotes removed.
  #
  # Returns nil for a nil reference and '' for an empty one.
  def reference_to_key(reference)
    return nil unless reference
    return '' if reference.empty?

    space_at = reference.index(' ')
    first_word = space_at ? reference[0...space_at] : reference
    first_word.delete('"')
  end

  # Adds an :aaseq entry to each record (necessary for avg_overlap).  The
  # values are shallow references to the aaseq strings of the fasta object.
  #
  # Proteins in +fasta+ that have no record in this index are skipped rather
  # than raising NoMethodError on nil.
  def update_aaseq(fasta)
    fasta.each do |prot|
      record = self[reference_to_key(prot.reference)]
      record[:aaseq] = prot.aaseq if record
    end
  end

end
|
|
55
|
-
|
|
56
|
-
# Entry point for reading Phobius prediction output.
class Phobius
  include TransmemIndex

  # Parses +file+ with the :short parser, filling (and returning whatever
  # the parser returns for) +index+.
  def self.default_index(file, index={})
    Phobius::Parser.new(:short).file_to_index(file, index)
  end

end
|
|
66
|
-
|
|
67
|
-
# Factory and shared behaviour for the Phobius output parsers.
module Phobius::Parser

  # Returns a new parser object for +parser_type+ (only :short is known).
  # Raises ArgumentError for any other type.
  def self.new(parser_type=:short)
    unless :short == parser_type
      raise ArgumentError, "don't recognize parser type: #{parser_type}"
    end
    Phobius::ParserShort.new
  end

  # Opens +file+ and hands the io to #to_index of the including parser.
  def file_to_index(file, index={})
    File.open(file) do |fh|
      to_index(fh, index)
    end
  end

end
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
# Parser for the 'Short' Phobius output format.
class Phobius::ParserShort
  include Phobius::Parser

  # Takes a phobius prediction string
  # (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i) and returns an array
  # of hashes with the keys :start and :stop, one per "<i|o>N-M" stretch.
  def prediction_to_array(string)
    string.scan(/[io](\d+)-(\d+)/).map do |first, last|
      { :start => first.to_i, :stop => last.to_i }
    end
  end

  # Fills +index+ from +io+ and returns it.  Each record has the form
  #
  #   identifier => { :num_certain_transmembrane_segments => Int,
  #                   :signal_peptide => true/false/nil,
  #                   :transmembrane_segments => [{:start => Int, :stop => Int}, ...] }
  #
  # :transmembrane_segments is only present when the count is above zero.
  # The io can be parsed even if there is no header line to key in on.
  def to_index(io, index={})
    init_pos = io.pos
    # look for the header within the first eleven lines ...
    found_header = 11.times.any? { io.gets =~ /SEQENCE/ }
    # ... and start over from the beginning when there is none
    io.pos = init_pos unless found_header

    signal_peptide_flags = { 'Y' => true, '0' => false }
    io.each do |line|
      fields = line.chomp.split(/\s+/)
      next unless fields.size == 4

      key, tm_count, sp_flag, prediction = fields
      tm_count = tm_count.to_i
      index[key] = {
        :num_certain_transmembrane_segments => tm_count,
        :signal_peptide => signal_peptide_flags[sp_flag],
      }
      if tm_count > 0
        index[key][:transmembrane_segments] = prediction_to_array(prediction)
      end
    end
    index
  end

end
|
data/lib/transmem/toppred.rb
DELETED
|
@@ -1,368 +0,0 @@
|
|
|
1
|
-
require 'transmem'
|
|
2
|
-
require 'xml_style_parser'
|
|
3
|
-
|
|
4
|
-
# Forward declaration: creates the TopPred namespace so that TopPred::Index can
# be defined before the class body is filled in further down this file.
class TopPred ; end
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
# Hash of TopPred predictions keyed by protein identifier.
#
# This class will probably change its interface some in the future.
# The web portal is at:
#   http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
# How to run:
#   uncheck 'Produce hydrophobicity graph image (-g)'
#   choose 'Xml' or 'New: new text' output
#   type in your email, then hit 'Run toppred'
class TopPred::Index < Hash
  include TransmemIndex

  # We need to match whatever function toppred uses to generate identifiers
  # if we want derivative processes to be fast and accurate: the key is the
  # text up to the first space with every non-alphanumeric character
  # replaced by an underscore.
  #
  # Returns nil for a nil reference.
  def reference_to_key(reference)
    return nil unless reference

    space_at = reference.index(' ')
    first_word = space_at ? reference[0...space_at] : reference
    first_word.gsub(/[^0-9a-zA-Z]/, '_')
  end

  # file:: path to a toppred output file
  # kind:: only :default is supported
  #
  # Raises ArgumentError for an unsupported +kind+ (instead of aborting the
  # whole process, so that library users can rescue it).
  def initialize(file, kind=:default)
    case kind
    when :default
      TopPred.default_index(file, self)
    else
      raise ArgumentError, "can't do #{kind}"
    end
  end

end
|
|
48
|
-
|
|
49
|
-
# Entry point for reading TopPred prediction output.
class TopPred
  include TransmemIndex

  # Detects the type of +file+, builds the matching parser and lets it fill
  # +index+; returns whatever the parser's file_to_index returns.
  def self.default_index(file, index={})
    detected_type = TopPred::Parser.filetype(file)
    parser = TopPred::Parser.new(detected_type)
    parser.file_to_index(file, index)
  end

end
|
|
58
|
-
|
|
59
|
-
# Factory and shared behaviour for the TopPred output parsers.
module TopPred::Parser

  # Inspects the first line of +file+ and returns :xml, :text, or nil when
  # the line matches neither format.
  def self.filetype(file)
    File.open(file) do |fh|
      case fh.gets
      when /<\?xml version.*>/
        :xml
      when /Algorithm specific/
        :text
      else
        nil
      end
    end
  end

  # Returns a new parser for +parser_type+ (:xml or :text).
  #
  # Raises ArgumentError for any other type (for instance the nil that
  # .filetype returns for an unrecognized file) instead of aborting the
  # whole process, so that library users can rescue it.
  def self.new(parser_type=:xml)
    klass =
      case parser_type
      when :xml
        TopPred::Parser_XML
      when :text
        TopPred::Parser_Text
      else
        raise ArgumentError, "don't recognize parser type: #{parser_type}"
      end
    klass.new
  end

  # Opens +file+ and hands the io to #to_index of the including parser.
  def file_to_index(file, index={})
    File.open(file) { |fh| to_index(fh, index) }
  end

  # Adds the matching stretch of +aaseq+ to every segment and returns
  # +segments+ (which is modified in place).
  #
  # Each segment is either an array [prob, first, last], which gets the
  # subsequence pushed on as a fourth element, or a hash with :start and
  # :stop, which gets an :aaseq key.  The representation is decided by
  # looking at the first segment.  first/last (start/stop) are '1' indexed
  # and inclusive.
  def add_sequences_to_segments(segments, aaseq)
    if segments.first.is_a? Array
      segments.each do |seg|
        first_index = seg[1] - 1
        length = (seg[2] - seg[1]) + 1
        seg.push(aaseq[first_index, length])
      end
    else
      segments.each do |seg|
        first_index = seg[:start] - 1
        length = (seg[:stop] - seg[:start]) + 1
        seg[:aaseq] = aaseq[first_index, length]
      end
    end
    segments
  end

end
|
|
117
|
-
|
|
118
|
-
# Factory and shared behaviour for the XML flavoured TopPred parsers.
module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  # Returns a new parser object chosen by XMLStyleParser.choose_parser.
  #
  # meth:: name of the method that #parse will dispatch to
  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    # #parse reads @method from the parser *instance*, so it has to be stored
    # there; setting it only on this module left it nil in every parser.
    parser.instance_variable_set(:@method, meth)
    # NOTE(review): module-level copy kept in case outside code reads it.
    @method = meth
    parser
  end

  # Dispatches to the method that was selected when the parser was built.
  def parse(file)
    send(@method, file)
  end
end
|
|
132
|
-
|
|
133
|
-
# DOM style parser for toppred XML output.  Subclasses supply
# get_root_node_from_io for a particular XML library.
class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser

  # Example of one index record (shown as YAML; aaseq shortened here):
  #
  #   YAL010C:
  #     num_putative_transmembrane_segments: 1
  #     aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDF...NGNIPVFPAKFGIQFQYST
  #     best_structure_probability: 1.0
  #     transmembrane_segments:
  #     - aaseq: SLGAEFWLGLVSLSPGCSTTL
  #       stop: 252
  #       start: 232
  #       probability: 1.0
  #     num_certain_transmembrane_segments: 1
  #     num_found: 2

  # Fills +index+ with one record per <toppred> element and returns it.
  # Aborts when the document does not have the expected structure.
  def to_index(io, index = {})
    get_root_node_from_io(io) do |root_n|
      abort if root_n.name != 'toppreds'

      root_n.find('child::toppred').each do |toppred_n|
        seq_n = toppred_n.find_first('child::sequence')
        record = {}
        index[seq_n['id']] = record
        record[:aaseq] = seq_n.content.gsub(/[\s\n]/,'')
        # the declared size must agree with the sequence we just read
        abort if record[:aaseq].size != seq_n['size'].to_i

        summary_n = seq_n.find_first('following-sibling::tmsummary')
        found = summary_n['segments'].to_i
        record[:num_found] = found
        next unless found > 0

        # anything that is not 'certain' is counted as putative
        tally = { :certain => 0, :putative => 0 }
        summary_n.find('child::segment').each do |segment_n|
          abort if segment_n.name != 'segment'
          tally['certain' == segment_n['type'] ? :certain : :putative] += 1
        end
        record[:num_putative_transmembrane_segments] = tally[:putative]
        record[:num_certain_transmembrane_segments] = tally[:certain]

        topologies_n = summary_n.next
        abort if topologies_n.name != 'topologies'

        # keep only the segments of the top probability topology
        best_topology_n = topologies_n.find('child::topology').to_a.max_by { |topo_n| topo_n['prob'].to_f }
        tmsegments = []
        best_topology_n.find('child::tmsegment').each do |tmsegment_n|
          ## WARNING! it appears the probability is broken on xml output!!
          tmsegments << {
            :start => tmsegment_n['start'].to_i,
            :stop => tmsegment_n['stop'].to_i,
            :probability => tmsegment_n['prob'].to_f,
          }
        end
        add_sequences_to_segments(tmsegments, record[:aaseq])
        record[:transmembrane_segments] = tmsegments
      end
    end
    index
  end

end
|
|
204
|
-
|
|
205
|
-
# Parser for the 'new text' toppred output format.
class TopPred::Parser_Text
  include TopPred::Parser

  # Fills +index+ from +io+ and returns it.  The structure has this form:
  #
  #   identifier => { :aaseq => String, :num_found => Int,
  #                   :num_certain_transmembrane_segments => Int,
  #                   :num_putative_transmembrane_segments => Int,
  #                   :best_structure_probability => Float,
  #                   :transmembrane_segments => [ { :probability => Float,
  #                     :start => Int, :stop => Int, :aaseq => String }, ... ] }
  def to_index(io, index={})
    record = nil

    io.each do |line|
      case line
      when /^Sequence : (.*?) +\(/
        identifier = $1.dup
        index[identifier] = {}
        record = index[identifier]
        record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, record)
      when /^HEADER\s+START\s+STOP/
        best = top_structure( read_structures(io) )
        record[:best_structure_probability] = best[:probability]
        record[:transmembrane_segments] = best[:tm]
        add_sequences_to_segments(record[:transmembrane_segments], record[:aaseq])
        segment_arrays_to_hashes(record[:transmembrane_segments])
      end
    end
    index
  end

  private

  # Returns a list of all structures given a filehandle positioned just
  # after the first "HEADER START STOP ..." line.  Keeps reading structures
  # for as long as each one is followed by another such header line.
  def read_structures(fh)
    structures = [ read_structure(fh) ]
    until fh.eof?
      break unless fh.readline =~ /^HEADER\s+START\s+STOP/
      structures << read_structure(fh)
    end
    structures
  end

  # Returns a hash with key :probability (taken from the third field of the
  # next line) and key :tm, an array of arrays:
  # [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    summary_line = fh.readline
    {
      :probability => summary_line.split(/\s+/)[2].to_f,
      :tm => read_segments(fh),
    }
  end

  # Returns an array of arrays of transmembrane segments:
  # [prob(Float), start(Int), stop(Int)].  Returns after seeing '//'.
  def read_segments(fh)
    segments = []
    fh.each do |line|
      case line
      when /^TRANSMEM/
        _header, first, last, _len, prob = line.split(/\s+/)[0,5]
        segments << [prob.to_f, first.to_i, last.to_i]
      when %r{//}
        break
      end
    end
    segments
  end

  # Returns the top probability structure (first on tie).
  def top_structure(list)
    highest = list.inject(list.first[:probability]) do |max, st|
      st[:probability] > max ? st[:probability] : max
    end
    list.find { |st| st[:probability] == highest }
  end

  # Concatenates the lines that follow until one without a word character
  # or '*' is reached, and returns the result.
  def read_aaseq(fh)
    aaseq = ''
    fh.each do |line|
      residues = line.chomp
      break if residues !~ /[\w\*]/
      aaseq << residues
    end
    aaseq
  end

  # Converts, in place, each [prob, start, stop, aaseq] array of +list+
  # into a hash with the keys :probability, :start, :stop and :aaseq.
  def segment_arrays_to_hashes(list)
    keys = [:probability, :start, :stop, :aaseq]
    list.map! { |ar| Hash[ keys.zip(ar) ] }
  end

  # Returns [certain, putative].  Expects the first line to be a tm segment
  # and stops at the first line without any field.
  def num_certain_putative(fh)
    tally = Hash.new(0)
    fh.each do |line|
      certainty = line.chomp.split(/\s+/).last
      break unless certainty
      tally[certainty] += 1
    end
    [ tally['Certain'], tally['Putative'] ]
  end

  # Reads the segment summary into +rec+: the number of segments found and,
  # when there are any, the counts of certain and putative segments.
  def read_segment_summary(fh, rec)
    fh.each do |line|
      case line
      when /Found: (.*?) segments/
        rec[:num_found] = $1.to_i
        break if rec[:num_found] == 0
      when /Helix\s+Begin/
        certain, putative = num_certain_putative(fh)
        rec[:num_certain_transmembrane_segments] = certain
        rec[:num_putative_transmembrane_segments] = putative
        break
      end
    end
  end
end
|
|
346
|
-
|
|
347
|
-
# DOM parser backed by XML::Parser.
class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  # Parses +io+ and calls +block+ with the document's root node; returns
  # the block's value.
  #
  # Warnings are switched off while parsing because this doesn't seem to
  # work:
  #   XML::Parser.default_load_external_dtd = false
  # (There is a warning about not finding DTD)
  def get_root_node_from_io(io, &block)
    saved_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    begin
      block.call(XML::Parser.io(io).parse.root)
    ensure
      # reset the warning level of XML::Parser, even when parsing or the
      # block raised, so the global setting never stays switched off
      XML::Parser.default_warnings = saved_warnings
    end
  end
end
|
|
361
|
-
|
|
362
|
-
# DOM parser backed by ::AXML.
class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  # Parses +io+ with ::AXML and calls +block+ with the result, returning
  # the block's value.
  def get_root_node_from_io(io, &block)
    block.call(::AXML.parse(io))
  end
end
|
|
368
|
-
|