egor 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/README.rdoc +1 -1
- data/egor.gemspec +5 -5
- data/lib/egor/cli.rb +128 -95
- data/lib/egor.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +0 -0
- metadata +3 -3
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -65,7 +65,7 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
65
65
|
|
66
66
|
== REQUIREMENTS:
|
67
67
|
|
68
|
-
* ruby 1.8.
|
68
|
+
* ruby 1.8.7 or above (http://www.ruby-lang.org)
|
69
69
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
|
70
70
|
|
71
71
|
Following RubyGems will be automatically installed if you have rubygems installed on your machine
|
data/egor.gemspec
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{egor}
|
5
|
-
s.version = "0.0.
|
5
|
+
s.version = "0.0.4"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Semin Lee"]
|
9
9
|
s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
|
10
|
-
s.date = %q{2008-12-
|
10
|
+
s.date = %q{2008-12-15}
|
11
11
|
s.default_executable = %q{egor}
|
12
12
|
s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
13
13
|
s.email = ["seminlee@gmail.com"]
|
@@ -34,14 +34,14 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
|
35
35
|
s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
|
36
36
|
s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
37
|
-
s.add_development_dependency(%q<newgem>, [">= 1.1
|
37
|
+
s.add_development_dependency(%q<newgem>, [">= 1.2.1"])
|
38
38
|
s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
|
39
39
|
else
|
40
40
|
s.add_dependency(%q<narray>, [">= 0.5.9.5"])
|
41
41
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
42
42
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
43
43
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<newgem>, [">= 1.1
|
44
|
+
s.add_dependency(%q<newgem>, [">= 1.2.1"])
|
45
45
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
46
46
|
end
|
47
47
|
else
|
@@ -49,7 +49,7 @@ Gem::Specification.new do |s|
|
|
49
49
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
50
50
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
51
51
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
52
|
-
s.add_dependency(%q<newgem>, [">= 1.1
|
52
|
+
s.add_dependency(%q<newgem>, [">= 1.2.1"])
|
53
53
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
54
54
|
end
|
55
55
|
end
|
data/lib/egor/cli.rb
CHANGED
@@ -48,13 +48,13 @@ Options:
|
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
51
|
-
0 for
|
51
|
+
0 for partial smoothing (default)
|
52
52
|
1 for full smoothing
|
53
53
|
--nosmooth: perform no smoothing operation
|
54
54
|
--cys (-y) INTEGER:
|
55
55
|
0 for using C and J only for structure (default)
|
56
56
|
1 for both structure and sequence
|
57
|
-
2 for using only C for both
|
57
|
+
2 for using only C for both (should be set having no 'disulphide bonds' environment feature)
|
58
58
|
--output INTEGER:
|
59
59
|
0 for raw counts (no-smoothing performed)
|
60
60
|
1 for probabilities
|
@@ -163,6 +163,9 @@ Options:
|
|
163
163
|
$tot_freq_mat = nil
|
164
164
|
$tot_prob_mat = nil
|
165
165
|
$tot_logo_mat = nil
|
166
|
+
#
|
167
|
+
# Part 1 END
|
168
|
+
#
|
166
169
|
|
167
170
|
# Part 2.
|
168
171
|
#
|
@@ -174,6 +177,7 @@ Options:
|
|
174
177
|
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
175
178
|
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
176
179
|
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
180
|
+
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
177
181
|
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
178
182
|
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
179
183
|
[ '--heatmap', GetoptLong::NO_ARGUMENT ],
|
@@ -249,6 +253,9 @@ Options:
|
|
249
253
|
print_usage
|
250
254
|
exit 1
|
251
255
|
end
|
256
|
+
#
|
257
|
+
# Part 2 END
|
258
|
+
#
|
252
259
|
|
253
260
|
|
254
261
|
# Part 3.
|
@@ -315,6 +322,9 @@ Options:
|
|
315
322
|
}.each_with_index { |e, i|
|
316
323
|
$envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
317
324
|
}
|
325
|
+
#
|
326
|
+
# Part 3 END
|
327
|
+
#
|
318
328
|
|
319
329
|
# Part 4.
|
320
330
|
#
|
@@ -621,7 +631,7 @@ Options:
|
|
621
631
|
end
|
622
632
|
end
|
623
633
|
end
|
624
|
-
end
|
634
|
+
end
|
625
635
|
end
|
626
636
|
|
627
637
|
# print out default header
|
@@ -660,7 +670,13 @@ HEADER
|
|
660
670
|
|
661
671
|
# calculate amino acid frequencies and mutabilities, and
|
662
672
|
# print them as default statistics in the header part
|
663
|
-
ala_factor =
|
673
|
+
ala_factor = if $aa_tot_obs["A"] == 0
|
674
|
+
0.0
|
675
|
+
elsif $aa_mut_obs["A"] == 0
|
676
|
+
0.0
|
677
|
+
else
|
678
|
+
100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
679
|
+
end
|
664
680
|
$tot_aa = $aa_tot_obs.values.sum
|
665
681
|
|
666
682
|
$outfh.puts "#"
|
@@ -668,9 +684,9 @@ HEADER
|
|
668
684
|
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
669
685
|
|
670
686
|
$amino_acids.each do |res|
|
671
|
-
$aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
687
|
+
$aa_mutb[res] = $aa_tot_obs[res] == 0 ? 1.0 : $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
672
688
|
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
673
|
-
$aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
|
689
|
+
$aa_rel_freq[res] = $aa_tot_obs[res] == 0 ? 0.0 : $aa_tot_obs[res] / $tot_aa.to_f
|
674
690
|
end
|
675
691
|
|
676
692
|
$amino_acids.each do |res|
|
@@ -682,6 +698,9 @@ HEADER
|
|
682
698
|
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
683
699
|
end
|
684
700
|
end
|
701
|
+
#
|
702
|
+
# Part 4. END
|
703
|
+
#
|
685
704
|
|
686
705
|
|
687
706
|
# Part 5.
|
@@ -728,6 +747,9 @@ HEADER
|
|
728
747
|
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
729
748
|
exit 0
|
730
749
|
end
|
750
|
+
#
|
751
|
+
# Part 5. END
|
752
|
+
#
|
731
753
|
|
732
754
|
|
733
755
|
# Part 6.
|
@@ -746,6 +768,7 @@ HEADER
|
|
746
768
|
HEADER
|
747
769
|
end
|
748
770
|
|
771
|
+
# when nosmoothing !!!
|
749
772
|
if ($output > 0) && $nosmooth
|
750
773
|
# Probability matrices
|
751
774
|
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
@@ -781,7 +804,7 @@ HEADER
|
|
781
804
|
end
|
782
805
|
end
|
783
806
|
|
784
|
-
#
|
807
|
+
# when smoothing!!!
|
785
808
|
if ($output > 0) && !$nosmooth
|
786
809
|
#
|
787
810
|
# p1 probability
|
@@ -905,7 +928,13 @@ HEADER
|
|
905
928
|
|
906
929
|
# entropy based weighting priors
|
907
930
|
entropy_max = Math::log($amino_acids.size)
|
908
|
-
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
931
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
932
|
+
begin
|
933
|
+
p == 0.0 ? s - 1 : s + p * Math::log(p)
|
934
|
+
rescue
|
935
|
+
#puts "P: #{p}"
|
936
|
+
end
|
937
|
+
} }
|
909
938
|
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
910
939
|
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
911
940
|
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
@@ -1058,96 +1087,101 @@ HEADER
|
|
1058
1087
|
$outfh.close
|
1059
1088
|
exit 0
|
1060
1089
|
end
|
1090
|
+
end
|
1091
|
+
#
|
1092
|
+
# Part 6. END
|
1093
|
+
#
|
1061
1094
|
|
1062
1095
|
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1096
|
+
# Part 7.
|
1097
|
+
#
|
1098
|
+
# Calculating log odds ratio scoring matrices
|
1099
|
+
#
|
1100
|
+
if $output == 2
|
1101
|
+
$outfh.puts <<HEADER
|
1069
1102
|
#
|
1070
1103
|
# The probabilities were then divided by the background probabilities
|
1071
1104
|
HEADER
|
1072
|
-
|
1073
|
-
|
1105
|
+
if $penv
|
1106
|
+
$outfh.puts <<HEADER
|
1074
1107
|
# which were derived from the environment-independent amino acid frequencies.
|
1075
1108
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1076
1109
|
HEADER
|
1077
|
-
|
1078
|
-
|
1110
|
+
else
|
1111
|
+
$outfh.puts <<HEADER
|
1079
1112
|
# which were derived from the environment-dependent amino acid frequencies.
|
1080
1113
|
# ^^^^^^^^^^^^^^^^^^^^^
|
1081
1114
|
HEADER
|
1082
|
-
|
1115
|
+
end
|
1083
1116
|
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
$amino_acids.each_with_index do |aa, ai|
|
1104
|
-
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1105
|
-
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1106
|
-
|
1107
|
-
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1108
|
-
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1109
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1110
|
-
logo_arr[j] = factor * Math::log(odds)
|
1111
|
-
end
|
1117
|
+
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1118
|
+
grp_logo_mats = []
|
1119
|
+
factor = $scale / Math::log(2)
|
1120
|
+
|
1121
|
+
# grouping environments by their environment labels, ignoring the leading amino acid label
|
1122
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1123
|
+
|
1124
|
+
# sorting environments and building 21X21 substitution matrices
|
1125
|
+
env_groups.to_a.sort_by { |env_group|
|
1126
|
+
# a bit clumsy sorting here...
|
1127
|
+
env_group[0].split("").map_with_index { |l, i|
|
1128
|
+
$env_features[i + 1].labels.index(l)
|
1129
|
+
}
|
1130
|
+
}.each_with_index do |group, group_no|
|
1131
|
+
# calculating substitution probability matrix for each environment
|
1132
|
+
grp_label = group[0]
|
1133
|
+
grp_envs = group[1]
|
1134
|
+
grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1112
1135
|
|
1113
|
-
|
1136
|
+
$amino_acids.each_with_index do |aa, ai|
|
1137
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1138
|
+
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1114
1139
|
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1120
|
-
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1121
|
-
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1122
|
-
end
|
1140
|
+
env.send($nosmooth ? "prob_array" : "smooth_prob_array").to_a.each_with_index do |prob, j|
|
1141
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1142
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1143
|
+
logo_arr[j] = factor * Math::log(odds)
|
1123
1144
|
end
|
1124
1145
|
|
1125
|
-
$
|
1126
|
-
|
1146
|
+
0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1147
|
+
|
1148
|
+
# adding log odds ratio for "U" (J or C) when --cys is 0
|
1149
|
+
if $cys == 0
|
1150
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1151
|
+
prob = env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("C")] +
|
1152
|
+
env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("J")]
|
1153
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1154
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1155
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1156
|
+
end
|
1127
1157
|
end
|
1128
1158
|
|
1129
|
-
$tot_logo_mat
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1159
|
+
$tot_logo_mat += grp_logo_mat
|
1160
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1161
|
+
end
|
1162
|
+
|
1163
|
+
$tot_logo_mat /= env_groups.size
|
1164
|
+
|
1165
|
+
# calculating relative entropy for each amino acid pair H and
|
1166
|
+
# the expected score E in bit units
|
1167
|
+
#
|
1168
|
+
# I'm a bit suspicious about this part...
|
1169
|
+
tot_E = 0.0
|
1170
|
+
tot_H = 0.0
|
1171
|
+
|
1172
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1173
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1174
|
+
if i != j
|
1175
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1176
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1177
|
+
else
|
1178
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1179
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1147
1180
|
end
|
1148
1181
|
end
|
1182
|
+
end
|
1149
1183
|
|
1150
|
-
|
1184
|
+
$outfh.puts <<HEADER
|
1151
1185
|
#
|
1152
1186
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1153
1187
|
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
@@ -1156,31 +1190,30 @@ HEADER
|
|
1156
1190
|
#
|
1157
1191
|
HEADER
|
1158
1192
|
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1193
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1194
|
+
grp_label = arr[0]
|
1195
|
+
grp_logo_mat = arr[1]
|
1162
1196
|
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
end
|
1197
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1198
|
+
if $cys
|
1199
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1200
|
+
else
|
1201
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1169
1202
|
end
|
1203
|
+
end
|
1170
1204
|
|
1171
|
-
|
1205
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1172
1206
|
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
end
|
1178
|
-
$outfh.close
|
1179
|
-
exit 0
|
1207
|
+
if $cys == 0
|
1208
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1209
|
+
else
|
1210
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1180
1211
|
end
|
1212
|
+
$outfh.close
|
1213
|
+
exit 0
|
1181
1214
|
end
|
1182
1215
|
end
|
1216
|
+
end
|
1183
1217
|
|
1184
|
-
end # class << self
|
1185
1218
|
end # class CLI
|
1186
1219
|
end # module Egor
|
data/lib/egor.rb
CHANGED
data/website/index.html
CHANGED
@@ -34,7 +34,7 @@
|
|
34
34
|
<div class="sidebar">
|
35
35
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/egor"; return false'>
|
36
36
|
<p>Get Version</p>
|
37
|
-
<a href="http://rubyforge.org/projects/egor" class="numbers">0.0.
|
37
|
+
<a href="http://rubyforge.org/projects/egor" class="numbers">0.0.4</a>
|
38
38
|
</div>
|
39
39
|
</div>
|
40
40
|
<h2>What</h2>
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: egor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Semin Lee
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
35w+y1Jd
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-12-
|
33
|
+
date: 2008-12-15 00:00:00 +00:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
requirements:
|
82
82
|
- - ">="
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 1.1
|
84
|
+
version: 1.2.1
|
85
85
|
version:
|
86
86
|
- !ruby/object:Gem::Dependency
|
87
87
|
name: hoe
|
metadata.gz.sig
CHANGED
Binary file
|