egor 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.4 2008-12-15
2
+
3
+ * 2 major enhancements:
4
+ * log-odds ratio matrices can be generated with --nosmooth option
5
+ * safe handling for zero observations or mutations of amino acids
6
+
1
7
  == 0.0.3 2008-12-09
2
8
 
3
9
  * 2 major enhancement:
data/README.rdoc CHANGED
@@ -65,7 +65,7 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
65
65
 
66
66
  == REQUIREMENTS:
67
67
 
68
- * ruby 1.8.6 or above (http://www.ruby-lang.org)
68
+ * ruby 1.8.7 or above (http://www.ruby-lang.org)
69
69
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
70
70
 
71
71
  Following RubyGems will be automatically installed if you have rubygems installed on your machine
data/egor.gemspec CHANGED
@@ -2,12 +2,12 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{egor}
5
- s.version = "0.0.3"
5
+ s.version = "0.0.4"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Semin Lee"]
9
9
  s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
10
- s.date = %q{2008-12-09}
10
+ s.date = %q{2008-12-15}
11
11
  s.default_executable = %q{egor}
12
12
  s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
13
13
  s.email = ["seminlee@gmail.com"]
@@ -34,14 +34,14 @@ Gem::Specification.new do |s|
34
34
  s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
35
35
  s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
36
36
  s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
37
- s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
37
+ s.add_development_dependency(%q<newgem>, [">= 1.2.1"])
38
38
  s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
39
39
  else
40
40
  s.add_dependency(%q<narray>, [">= 0.5.9.5"])
41
41
  s.add_dependency(%q<bio>, [">= 1.2.1"])
42
42
  s.add_dependency(%q<facets>, [">= 2.4.5"])
43
43
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
44
- s.add_dependency(%q<newgem>, [">= 1.1.0"])
44
+ s.add_dependency(%q<newgem>, [">= 1.2.1"])
45
45
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
46
46
  end
47
47
  else
@@ -49,7 +49,7 @@ Gem::Specification.new do |s|
49
49
  s.add_dependency(%q<bio>, [">= 1.2.1"])
50
50
  s.add_dependency(%q<facets>, [">= 2.4.5"])
51
51
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
52
- s.add_dependency(%q<newgem>, [">= 1.1.0"])
52
+ s.add_dependency(%q<newgem>, [">= 1.2.1"])
53
53
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
54
54
  end
55
55
  end
data/lib/egor/cli.rb CHANGED
@@ -48,13 +48,13 @@ Options:
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
51
- 0 for parial smoothing (default)
51
+ 0 for partial smoothing (default)
52
52
  1 for full smoothing
53
53
  --nosmooth: perform no smoothing operation
54
54
  --cys (-y) INTEGER:
55
55
  0 for using C and J only for structure (default)
56
56
  1 for both structure and sequence
57
- 2 for using only C for both
57
+ 2 for using only C for both (should be set having no 'disulphide bonds' environment feature)
58
58
  --output INTEGER:
59
59
  0 for raw counts (no-smoothing performed)
60
60
  1 for probabilities
@@ -163,6 +163,9 @@ Options:
163
163
  $tot_freq_mat = nil
164
164
  $tot_prob_mat = nil
165
165
  $tot_logo_mat = nil
166
+ #
167
+ # Part 1 END
168
+ #
166
169
 
167
170
  # Part 2.
168
171
  #
@@ -174,6 +177,7 @@ Options:
174
177
  [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
175
178
  [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
176
179
  [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
180
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
177
181
  [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
178
182
  [ '--noweight', GetoptLong::NO_ARGUMENT ],
179
183
  [ '--heatmap', GetoptLong::NO_ARGUMENT ],
@@ -249,6 +253,9 @@ Options:
249
253
  print_usage
250
254
  exit 1
251
255
  end
256
+ #
257
+ # Part 2 END
258
+ #
252
259
 
253
260
 
254
261
  # Part 3.
@@ -315,6 +322,9 @@ Options:
315
322
  }.each_with_index { |e, i|
316
323
  $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
317
324
  }
325
+ #
326
+ # Part 3 END
327
+ #
318
328
 
319
329
  # Part 4.
320
330
  #
@@ -621,7 +631,7 @@ Options:
621
631
  end
622
632
  end
623
633
  end
624
- end # if !$nosmooth
634
+ end
625
635
  end
626
636
 
627
637
  # print out default header
@@ -660,7 +670,13 @@ HEADER
660
670
 
661
671
  # calculate amino acid frequencies and mutabilities, and
662
672
  # print them as default statistics in the header part
663
- ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
673
+ ala_factor = if $aa_tot_obs["A"] == 0
674
+ 0.0
675
+ elsif $aa_mut_obs["A"] == 0
676
+ 0.0
677
+ else
678
+ 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
679
+ end
664
680
  $tot_aa = $aa_tot_obs.values.sum
665
681
 
666
682
  $outfh.puts "#"
@@ -668,9 +684,9 @@ HEADER
668
684
  $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
669
685
 
670
686
  $amino_acids.each do |res|
671
- $aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
687
+ $aa_mutb[res] = $aa_tot_obs[res] == 0 ? 1.0 : $aa_mut_obs[res] / $aa_tot_obs[res].to_f
672
688
  $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
673
- $aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
689
+ $aa_rel_freq[res] = $aa_tot_obs[res] == 0 ? 0.0 : $aa_tot_obs[res] / $tot_aa.to_f
674
690
  end
675
691
 
676
692
  $amino_acids.each do |res|
@@ -682,6 +698,9 @@ HEADER
682
698
  [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
683
699
  end
684
700
  end
701
+ #
702
+ # Part 4. END
703
+ #
685
704
 
686
705
 
687
706
  # Part 5.
@@ -728,6 +747,9 @@ HEADER
728
747
  $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
729
748
  exit 0
730
749
  end
750
+ #
751
+ # Part 5. END
752
+ #
731
753
 
732
754
 
733
755
  # Part 6.
@@ -746,6 +768,7 @@ HEADER
746
768
  HEADER
747
769
  end
748
770
 
771
+ # when no smoothing is requested (--nosmooth)
749
772
  if ($output > 0) && $nosmooth
750
773
  # Probability matrices
751
774
  $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
@@ -781,7 +804,7 @@ HEADER
781
804
  end
782
805
  end
783
806
 
784
- # for smoothing...
807
+ # when smoothing is requested (no --nosmooth)
785
808
  if ($output > 0) && !$nosmooth
786
809
  #
787
810
  # p1 probability
@@ -905,7 +928,13 @@ HEADER
905
928
 
906
929
  # entropy based weighting priors
907
930
  entropy_max = Math::log($amino_acids.size)
908
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
931
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
932
+ begin
933
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
934
+ rescue
935
+ #puts "P: #{p}"
936
+ end
937
+ } }
909
938
  mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
910
939
  weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
911
940
  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
@@ -1058,96 +1087,101 @@ HEADER
1058
1087
  $outfh.close
1059
1088
  exit 0
1060
1089
  end
1090
+ end
1091
+ #
1092
+ # Part 6. END
1093
+ #
1061
1094
 
1062
1095
 
1063
- # Part 7.
1064
- #
1065
- # Calculating log odds ratio scoring matrices
1066
- #
1067
- if $output == 2
1068
- $outfh.puts <<HEADER
1096
+ # Part 7.
1097
+ #
1098
+ # Calculating log odds ratio scoring matrices
1099
+ #
1100
+ if $output == 2
1101
+ $outfh.puts <<HEADER
1069
1102
  #
1070
1103
  # The probabilities were then divided by the background probabilities
1071
1104
  HEADER
1072
- if $penv
1073
- $outfh.puts <<HEADER
1105
+ if $penv
1106
+ $outfh.puts <<HEADER
1074
1107
  # which were derived from the environment-independent amino acid frequencies.
1075
1108
  # ^^^^^^^^^^^^^^^^^^^^^^^
1076
1109
  HEADER
1077
- else
1078
- $outfh.puts <<HEADER
1110
+ else
1111
+ $outfh.puts <<HEADER
1079
1112
  # which were derived from the environment-dependent amino acid frequencies.
1080
1113
  # ^^^^^^^^^^^^^^^^^^^^^
1081
1114
  HEADER
1082
- end
1115
+ end
1083
1116
 
1084
- $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1085
- grp_logo_mats = []
1086
- factor = $scale / Math::log(2)
1087
-
1088
- # grouping environments by its environment labels but amino acid label
1089
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1090
-
1091
- # sorting environments and build 21X21 substitution matrices
1092
- env_groups.to_a.sort_by { |env_group|
1093
- # a bit clumsy sorting here...
1094
- env_group[0].split("").map_with_index { |l, i|
1095
- $env_features[i + 1].labels.index(l)
1096
- }
1097
- }.each_with_index do |group, group_no|
1098
- # calculating 21X21 substitution probability matrix for each envrionment
1099
- grp_label = group[0]
1100
- grp_envs = group[1]
1101
- grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1102
-
1103
- $amino_acids.each_with_index do |aa, ai|
1104
- env = grp_envs.detect { |e| e.label.start_with?(aa) }
1105
- logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1106
-
1107
- env.smooth_prob_array.to_a.each_with_index do |prob, j|
1108
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1109
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1110
- logo_arr[j] = factor * Math::log(odds)
1111
- end
1117
+ $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1118
+ grp_logo_mats = []
1119
+ factor = $scale / Math::log(2)
1120
+
1121
+ # grouping environments by their labels, ignoring the leading amino acid character
1122
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1123
+
1124
+ # sorting environments and build 21X21 substitution matrices
1125
+ env_groups.to_a.sort_by { |env_group|
1126
+ # a bit clumsy sorting here...
1127
+ env_group[0].split("").map_with_index { |l, i|
1128
+ $env_features[i + 1].labels.index(l)
1129
+ }
1130
+ }.each_with_index do |group, group_no|
1131
+ # calculating substitution probability matrix for each environment
1132
+ grp_label = group[0]
1133
+ grp_envs = group[1]
1134
+ grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1112
1135
 
1113
- 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1136
+ $amino_acids.each_with_index do |aa, ai|
1137
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1138
+ logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1114
1139
 
1115
- # adding log odds ratio for "U" (J or C) when --cyc is 0
1116
- if $cys == 0
1117
- paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1118
- prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1119
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1120
- logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1121
- grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1122
- end
1140
+ env.send($nosmooth ? "prob_array" : "smooth_prob_array").to_a.each_with_index do |prob, j|
1141
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1142
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1143
+ logo_arr[j] = factor * Math::log(odds)
1123
1144
  end
1124
1145
 
1125
- $tot_logo_mat += grp_logo_mat
1126
- grp_logo_mats << [grp_label, grp_logo_mat]
1146
+ 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1147
+
1148
+ # adding log odds ratio for "U" (J or C) when --cys is 0
1149
+ if $cys == 0
1150
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1151
+ prob = env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("C")] +
1152
+ env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("J")]
1153
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1154
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1155
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1156
+ end
1127
1157
  end
1128
1158
 
1129
- $tot_logo_mat /= env_groups.size
1130
-
1131
- # calculating relative entropy for each amino acid pair H and
1132
- # the expected score E in bit units
1133
- #
1134
- # I'm a bit suspicious about this part...
1135
- tot_E = 0.0
1136
- tot_H = 0.0
1137
-
1138
- 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1139
- 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1140
- if i != j
1141
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1142
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1143
- else
1144
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1145
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1146
- end
1159
+ $tot_logo_mat += grp_logo_mat
1160
+ grp_logo_mats << [grp_label, grp_logo_mat]
1161
+ end
1162
+
1163
+ $tot_logo_mat /= env_groups.size
1164
+
1165
+ # calculating relative entropy for each amino acid pair H and
1166
+ # the expected score E in bit units
1167
+ #
1168
+ # I'm a bit suspicious about this part...
1169
+ tot_E = 0.0
1170
+ tot_H = 0.0
1171
+
1172
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1173
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1174
+ if i != j
1175
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1176
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1177
+ else
1178
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1179
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1147
1180
  end
1148
1181
  end
1182
+ end
1149
1183
 
1150
- $outfh.puts <<HEADER
1184
+ $outfh.puts <<HEADER
1151
1185
  #
1152
1186
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1153
1187
  # rounded to the nearest integer (log-odds scores in 1/3 bit units).
@@ -1156,31 +1190,30 @@ HEADER
1156
1190
  #
1157
1191
  HEADER
1158
1192
 
1159
- grp_logo_mats.each_with_index do |arr, grp_no|
1160
- grp_label = arr[0]
1161
- grp_logo_mat = arr[1]
1193
+ grp_logo_mats.each_with_index do |arr, grp_no|
1194
+ grp_label = arr[0]
1195
+ grp_logo_mat = arr[1]
1162
1196
 
1163
- $outfh.puts ">#{grp_label} #{grp_no}"
1164
- if $cys
1165
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1166
- else
1167
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1168
- end
1197
+ $outfh.puts ">#{grp_label} #{grp_no}"
1198
+ if $cys
1199
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1200
+ else
1201
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1169
1202
  end
1203
+ end
1170
1204
 
1171
- $outfh.puts ">Total #{grp_logo_mats.size}"
1205
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1172
1206
 
1173
- if $cys == 0
1174
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1175
- else
1176
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1177
- end
1178
- $outfh.close
1179
- exit 0
1207
+ if $cys == 0
1208
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1209
+ else
1210
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1180
1211
  end
1212
+ $outfh.close
1213
+ exit 0
1181
1214
  end
1182
1215
  end
1216
+ end
1183
1217
 
1184
- end # class << self
1185
1218
  end # class CLI
1186
1219
  end # module Egor
data/lib/egor.rb CHANGED
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.3'
5
+ VERSION = '0.0.4'
6
6
  end
data/website/index.html CHANGED
@@ -34,7 +34,7 @@
34
34
  <div class="sidebar">
35
35
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/egor"; return false'>
36
36
  <p>Get Version</p>
37
- <a href="http://rubyforge.org/projects/egor" class="numbers">0.0.3</a>
37
+ <a href="http://rubyforge.org/projects/egor" class="numbers">0.0.4</a>
38
38
  </div>
39
39
  </div>
40
40
  <h2>What</h2>
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Semin Lee
@@ -30,7 +30,7 @@ cert_chain:
30
30
  35w+y1Jd
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-12-09 00:00:00 +00:00
33
+ date: 2008-12-15 00:00:00 +00:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -81,7 +81,7 @@ dependencies:
81
81
  requirements:
82
82
  - - ">="
83
83
  - !ruby/object:Gem::Version
84
- version: 1.1.0
84
+ version: 1.2.1
85
85
  version:
86
86
  - !ruby/object:Gem::Dependency
87
87
  name: hoe
metadata.gz.sig CHANGED
Binary file