egor 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/README.rdoc +1 -1
- data/egor.gemspec +5 -5
- data/lib/egor/cli.rb +128 -95
- data/lib/egor.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +0 -0
- metadata +3 -3
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -65,7 +65,7 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
65
65
|
|
66
66
|
== REQUIREMENTS:
|
67
67
|
|
68
|
-
* ruby 1.8.
|
68
|
+
* ruby 1.8.7 or above (http://www.ruby-lang.org)
|
69
69
|
* rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
|
70
70
|
|
71
71
|
Following RubyGems will be automatically installed if you have rubygems installed on your machine
|
data/egor.gemspec
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{egor}
|
5
|
-
s.version = "0.0.
|
5
|
+
s.version = "0.0.4"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Semin Lee"]
|
9
9
|
s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
|
10
|
-
s.date = %q{2008-12-
|
10
|
+
s.date = %q{2008-12-15}
|
11
11
|
s.default_executable = %q{egor}
|
12
12
|
s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
|
13
13
|
s.email = ["seminlee@gmail.com"]
|
@@ -34,14 +34,14 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
|
35
35
|
s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
|
36
36
|
s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
37
|
-
s.add_development_dependency(%q<newgem>, [">= 1.1
|
37
|
+
s.add_development_dependency(%q<newgem>, [">= 1.2.1"])
|
38
38
|
s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
|
39
39
|
else
|
40
40
|
s.add_dependency(%q<narray>, [">= 0.5.9.5"])
|
41
41
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
42
42
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
43
43
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<newgem>, [">= 1.1
|
44
|
+
s.add_dependency(%q<newgem>, [">= 1.2.1"])
|
45
45
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
46
46
|
end
|
47
47
|
else
|
@@ -49,7 +49,7 @@ Gem::Specification.new do |s|
|
|
49
49
|
s.add_dependency(%q<bio>, [">= 1.2.1"])
|
50
50
|
s.add_dependency(%q<facets>, [">= 2.4.5"])
|
51
51
|
s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
|
52
|
-
s.add_dependency(%q<newgem>, [">= 1.1
|
52
|
+
s.add_dependency(%q<newgem>, [">= 1.2.1"])
|
53
53
|
s.add_dependency(%q<hoe>, [">= 1.8.0"])
|
54
54
|
end
|
55
55
|
end
|
data/lib/egor/cli.rb
CHANGED
@@ -48,13 +48,13 @@ Options:
|
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
51
|
-
0 for
|
51
|
+
0 for partial smoothing (default)
|
52
52
|
1 for full smoothing
|
53
53
|
--nosmooth: perform no smoothing operation
|
54
54
|
--cys (-y) INTEGER:
|
55
55
|
0 for using C and J only for structure (default)
|
56
56
|
1 for both structure and sequence
|
57
|
-
2 for using only C for both
|
57
|
+
2 for using only C for both (should be set having no 'disulphide bonds' environment feature)
|
58
58
|
--output INTEGER:
|
59
59
|
0 for raw counts (no-smoothing performed)
|
60
60
|
1 for probabilities
|
@@ -163,6 +163,9 @@ Options:
|
|
163
163
|
$tot_freq_mat = nil
|
164
164
|
$tot_prob_mat = nil
|
165
165
|
$tot_logo_mat = nil
|
166
|
+
#
|
167
|
+
# Part 1 END
|
168
|
+
#
|
166
169
|
|
167
170
|
# Part 2.
|
168
171
|
#
|
@@ -174,6 +177,7 @@ Options:
|
|
174
177
|
[ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
|
175
178
|
[ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
|
176
179
|
[ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
|
180
|
+
[ '--nosmooth', GetoptLong::NO_ARGUMENT ],
|
177
181
|
[ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
|
178
182
|
[ '--noweight', GetoptLong::NO_ARGUMENT ],
|
179
183
|
[ '--heatmap', GetoptLong::NO_ARGUMENT ],
|
@@ -249,6 +253,9 @@ Options:
|
|
249
253
|
print_usage
|
250
254
|
exit 1
|
251
255
|
end
|
256
|
+
#
|
257
|
+
# Part 2 END
|
258
|
+
#
|
252
259
|
|
253
260
|
|
254
261
|
# Part 3.
|
@@ -315,6 +322,9 @@ Options:
|
|
315
322
|
}.each_with_index { |e, i|
|
316
323
|
$envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
|
317
324
|
}
|
325
|
+
#
|
326
|
+
# Part 3 END
|
327
|
+
#
|
318
328
|
|
319
329
|
# Part 4.
|
320
330
|
#
|
@@ -621,7 +631,7 @@ Options:
|
|
621
631
|
end
|
622
632
|
end
|
623
633
|
end
|
624
|
-
end
|
634
|
+
end
|
625
635
|
end
|
626
636
|
|
627
637
|
# print out default header
|
@@ -660,7 +670,13 @@ HEADER
|
|
660
670
|
|
661
671
|
# calculate amino acid frequencies and mutabilities, and
|
662
672
|
# print them as default statistics in the header part
|
663
|
-
ala_factor =
|
673
|
+
ala_factor = if $aa_tot_obs["A"] == 0
|
674
|
+
0.0
|
675
|
+
elsif $aa_mut_obs["A"] == 0
|
676
|
+
0.0
|
677
|
+
else
|
678
|
+
100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
|
679
|
+
end
|
664
680
|
$tot_aa = $aa_tot_obs.values.sum
|
665
681
|
|
666
682
|
$outfh.puts "#"
|
@@ -668,9 +684,9 @@ HEADER
|
|
668
684
|
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
669
685
|
|
670
686
|
$amino_acids.each do |res|
|
671
|
-
$aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
687
|
+
$aa_mutb[res] = $aa_tot_obs[res] == 0 ? 1.0 : $aa_mut_obs[res] / $aa_tot_obs[res].to_f
|
672
688
|
$aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
|
673
|
-
$aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
|
689
|
+
$aa_rel_freq[res] = $aa_tot_obs[res] == 0 ? 0.0 : $aa_tot_obs[res] / $tot_aa.to_f
|
674
690
|
end
|
675
691
|
|
676
692
|
$amino_acids.each do |res|
|
@@ -682,6 +698,9 @@ HEADER
|
|
682
698
|
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
683
699
|
end
|
684
700
|
end
|
701
|
+
#
|
702
|
+
# Part 4. END
|
703
|
+
#
|
685
704
|
|
686
705
|
|
687
706
|
# Part 5.
|
@@ -728,6 +747,9 @@ HEADER
|
|
728
747
|
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
729
748
|
exit 0
|
730
749
|
end
|
750
|
+
#
|
751
|
+
# Part 5. END
|
752
|
+
#
|
731
753
|
|
732
754
|
|
733
755
|
# Part 6.
|
@@ -746,6 +768,7 @@ HEADER
|
|
746
768
|
HEADER
|
747
769
|
end
|
748
770
|
|
771
|
+
# when nosmoothing !!!
|
749
772
|
if ($output > 0) && $nosmooth
|
750
773
|
# Probability matrices
|
751
774
|
$tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
|
@@ -781,7 +804,7 @@ HEADER
|
|
781
804
|
end
|
782
805
|
end
|
783
806
|
|
784
|
-
#
|
807
|
+
# when smoothing!!!
|
785
808
|
if ($output > 0) && !$nosmooth
|
786
809
|
#
|
787
810
|
# p1 probability
|
@@ -905,7 +928,13 @@ HEADER
|
|
905
928
|
|
906
929
|
# entropy based weighting priors
|
907
930
|
entropy_max = Math::log($amino_acids.size)
|
908
|
-
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
931
|
+
entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
|
932
|
+
begin
|
933
|
+
p == 0.0 ? s - 1 : s + p * Math::log(p)
|
934
|
+
rescue
|
935
|
+
#puts "P: #{p}"
|
936
|
+
end
|
937
|
+
} }
|
909
938
|
mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
|
910
939
|
weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
|
911
940
|
weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
|
@@ -1058,96 +1087,101 @@ HEADER
|
|
1058
1087
|
$outfh.close
|
1059
1088
|
exit 0
|
1060
1089
|
end
|
1090
|
+
end
|
1091
|
+
#
|
1092
|
+
# Part 6. END
|
1093
|
+
#
|
1061
1094
|
|
1062
1095
|
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1096
|
+
# Part 7.
|
1097
|
+
#
|
1098
|
+
# Calculating log odds ratio scoring matrices
|
1099
|
+
#
|
1100
|
+
if $output == 2
|
1101
|
+
$outfh.puts <<HEADER
|
1069
1102
|
#
|
1070
1103
|
# The probabilities were then divided by the background probabilities
|
1071
1104
|
HEADER
|
1072
|
-
|
1073
|
-
|
1105
|
+
if $penv
|
1106
|
+
$outfh.puts <<HEADER
|
1074
1107
|
# which were derived from the environment-independent amino acid frequencies.
|
1075
1108
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1076
1109
|
HEADER
|
1077
|
-
|
1078
|
-
|
1110
|
+
else
|
1111
|
+
$outfh.puts <<HEADER
|
1079
1112
|
# which were derived from the environment-dependent amino acid frequencies.
|
1080
1113
|
# ^^^^^^^^^^^^^^^^^^^^^
|
1081
1114
|
HEADER
|
1082
|
-
|
1115
|
+
end
|
1083
1116
|
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
$amino_acids.each_with_index do |aa, ai|
|
1104
|
-
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1105
|
-
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1106
|
-
|
1107
|
-
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1108
|
-
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1109
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1110
|
-
logo_arr[j] = factor * Math::log(odds)
|
1111
|
-
end
|
1117
|
+
$tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1118
|
+
grp_logo_mats = []
|
1119
|
+
factor = $scale / Math::log(2)
|
1120
|
+
|
1121
|
+
# grouping environments by their environment labels, ignoring the leading amino acid label
|
1122
|
+
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
1123
|
+
|
1124
|
+
# sorting environments and building 21X21 substitution matrices
|
1125
|
+
env_groups.to_a.sort_by { |env_group|
|
1126
|
+
# a bit clumsy sorting here...
|
1127
|
+
env_group[0].split("").map_with_index { |l, i|
|
1128
|
+
$env_features[i + 1].labels.index(l)
|
1129
|
+
}
|
1130
|
+
}.each_with_index do |group, group_no|
|
1131
|
+
# calculating substitution probability matrix for each environment
|
1132
|
+
grp_label = group[0]
|
1133
|
+
grp_envs = group[1]
|
1134
|
+
grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
|
1112
1135
|
|
1113
|
-
|
1136
|
+
$amino_acids.each_with_index do |aa, ai|
|
1137
|
+
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1138
|
+
logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
|
1114
1139
|
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1120
|
-
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1121
|
-
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1122
|
-
end
|
1140
|
+
env.send($nosmooth ? "prob_array" : "smooth_prob_array").to_a.each_with_index do |prob, j|
|
1141
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1142
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1143
|
+
logo_arr[j] = factor * Math::log(odds)
|
1123
1144
|
end
|
1124
1145
|
|
1125
|
-
$
|
1126
|
-
|
1146
|
+
0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1147
|
+
|
1148
|
+
# adding log odds ratio for "U" (J or C) when --cys is 0
|
1149
|
+
if $cys == 0
|
1150
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1151
|
+
prob = env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("C")] +
|
1152
|
+
env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("J")]
|
1153
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1154
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1155
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1156
|
+
end
|
1127
1157
|
end
|
1128
1158
|
|
1129
|
-
$tot_logo_mat
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1159
|
+
$tot_logo_mat += grp_logo_mat
|
1160
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1161
|
+
end
|
1162
|
+
|
1163
|
+
$tot_logo_mat /= env_groups.size
|
1164
|
+
|
1165
|
+
# calculating relative entropy for each amino acid pair H and
|
1166
|
+
# the expected score E in bit units
|
1167
|
+
#
|
1168
|
+
# I'm a bit suspicious about this part...
|
1169
|
+
tot_E = 0.0
|
1170
|
+
tot_H = 0.0
|
1171
|
+
|
1172
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1173
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1174
|
+
if i != j
|
1175
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1176
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1177
|
+
else
|
1178
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1179
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1147
1180
|
end
|
1148
1181
|
end
|
1182
|
+
end
|
1149
1183
|
|
1150
|
-
|
1184
|
+
$outfh.puts <<HEADER
|
1151
1185
|
#
|
1152
1186
|
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1153
1187
|
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
@@ -1156,31 +1190,30 @@ HEADER
|
|
1156
1190
|
#
|
1157
1191
|
HEADER
|
1158
1192
|
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1193
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1194
|
+
grp_label = arr[0]
|
1195
|
+
grp_logo_mat = arr[1]
|
1162
1196
|
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
end
|
1197
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1198
|
+
if $cys
|
1199
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1200
|
+
else
|
1201
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1169
1202
|
end
|
1203
|
+
end
|
1170
1204
|
|
1171
|
-
|
1205
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1172
1206
|
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
end
|
1178
|
-
$outfh.close
|
1179
|
-
exit 0
|
1207
|
+
if $cys == 0
|
1208
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1209
|
+
else
|
1210
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1180
1211
|
end
|
1212
|
+
$outfh.close
|
1213
|
+
exit 0
|
1181
1214
|
end
|
1182
1215
|
end
|
1216
|
+
end
|
1183
1217
|
|
1184
|
-
end # class << self
|
1185
1218
|
end # class CLI
|
1186
1219
|
end # module Egor
|
data/lib/egor.rb
CHANGED
data/website/index.html
CHANGED
@@ -34,7 +34,7 @@
|
|
34
34
|
<div class="sidebar">
|
35
35
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/egor"; return false'>
|
36
36
|
<p>Get Version</p>
|
37
|
-
<a href="http://rubyforge.org/projects/egor" class="numbers">0.0.
|
37
|
+
<a href="http://rubyforge.org/projects/egor" class="numbers">0.0.4</a>
|
38
38
|
</div>
|
39
39
|
</div>
|
40
40
|
<h2>What</h2>
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: egor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Semin Lee
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
35w+y1Jd
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-12-
|
33
|
+
date: 2008-12-15 00:00:00 +00:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
requirements:
|
82
82
|
- - ">="
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 1.1
|
84
|
+
version: 1.2.1
|
85
85
|
version:
|
86
86
|
- !ruby/object:Gem::Dependency
|
87
87
|
name: hoe
|
metadata.gz.sig
CHANGED
Binary file
|