egor 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.4 2008-12-15
2
+
3
+ * 2 major enhancements:
4
+ * log-odds ratio matrices can be generated with --nosmooth option
5
+ * safe handling for zero observations or mutations of amino acids
6
+
1
7
  == 0.0.3 2008-12-09
2
8
 
3
9
  * 2 major enhancement:
data/README.rdoc CHANGED
@@ -65,7 +65,7 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
65
65
 
66
66
  == REQUIREMENTS:
67
67
 
68
- * ruby 1.8.6 or above (http://www.ruby-lang.org)
68
+ * ruby 1.8.7 or above (http://www.ruby-lang.org)
69
69
  * rubygems 1.2.0 or above (http://rubyforge.org/projects/rubygems/)
70
70
 
71
71
  Following RubyGems will be automatically installed if you have rubygems installed on your machine
data/egor.gemspec CHANGED
@@ -2,12 +2,12 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{egor}
5
- s.version = "0.0.3"
5
+ s.version = "0.0.4"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Semin Lee"]
9
9
  s.cert_chain = ["/Users/semin/.gem/gem-public_cert.pem"]
10
- s.date = %q{2008-12-09}
10
+ s.date = %q{2008-12-15}
11
11
  s.default_executable = %q{egor}
12
12
  s.description = %q{egor: Esst GeneratOR, a program for calculating environment-specific substitution tables}
13
13
  s.email = ["seminlee@gmail.com"]
@@ -34,14 +34,14 @@ Gem::Specification.new do |s|
34
34
  s.add_runtime_dependency(%q<bio>, [">= 1.2.1"])
35
35
  s.add_runtime_dependency(%q<facets>, [">= 2.4.5"])
36
36
  s.add_runtime_dependency(%q<simple_memoize>, [">= 1.0.0"])
37
- s.add_development_dependency(%q<newgem>, [">= 1.1.0"])
37
+ s.add_development_dependency(%q<newgem>, [">= 1.2.1"])
38
38
  s.add_development_dependency(%q<hoe>, [">= 1.8.0"])
39
39
  else
40
40
  s.add_dependency(%q<narray>, [">= 0.5.9.5"])
41
41
  s.add_dependency(%q<bio>, [">= 1.2.1"])
42
42
  s.add_dependency(%q<facets>, [">= 2.4.5"])
43
43
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
44
- s.add_dependency(%q<newgem>, [">= 1.1.0"])
44
+ s.add_dependency(%q<newgem>, [">= 1.2.1"])
45
45
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
46
46
  end
47
47
  else
@@ -49,7 +49,7 @@ Gem::Specification.new do |s|
49
49
  s.add_dependency(%q<bio>, [">= 1.2.1"])
50
50
  s.add_dependency(%q<facets>, [">= 2.4.5"])
51
51
  s.add_dependency(%q<simple_memoize>, [">= 1.0.0"])
52
- s.add_dependency(%q<newgem>, [">= 1.1.0"])
52
+ s.add_dependency(%q<newgem>, [">= 1.2.1"])
53
53
  s.add_dependency(%q<hoe>, [">= 1.8.0"])
54
54
  end
55
55
  end
data/lib/egor/cli.rb CHANGED
@@ -48,13 +48,13 @@ Options:
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
51
- 0 for parial smoothing (default)
51
+ 0 for partial smoothing (default)
52
52
  1 for full smoothing
53
53
  --nosmooth: perform no smoothing operation
54
54
  --cys (-y) INTEGER:
55
55
  0 for using C and J only for structure (default)
56
56
  1 for both structure and sequence
57
- 2 for using only C for both
57
+ 2 for using only C for both (should be set having no 'disulphide bonds' environment feature)
58
58
  --output INTEGER:
59
59
  0 for raw counts (no-smoothing performed)
60
60
  1 for probabilities
@@ -163,6 +163,9 @@ Options:
163
163
  $tot_freq_mat = nil
164
164
  $tot_prob_mat = nil
165
165
  $tot_logo_mat = nil
166
+ #
167
+ # Part 1 END
168
+ #
166
169
 
167
170
  # Part 2.
168
171
  #
@@ -174,6 +177,7 @@ Options:
174
177
  [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
175
178
  [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
176
179
  [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ],
180
+ [ '--nosmooth', GetoptLong::NO_ARGUMENT ],
177
181
  [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ],
178
182
  [ '--noweight', GetoptLong::NO_ARGUMENT ],
179
183
  [ '--heatmap', GetoptLong::NO_ARGUMENT ],
@@ -249,6 +253,9 @@ Options:
249
253
  print_usage
250
254
  exit 1
251
255
  end
256
+ #
257
+ # Part 2 END
258
+ #
252
259
 
253
260
 
254
261
  # Part 3.
@@ -315,6 +322,9 @@ Options:
315
322
  }.each_with_index { |e, i|
316
323
  $envs[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids)
317
324
  }
325
+ #
326
+ # Part 3 END
327
+ #
318
328
 
319
329
  # Part 4.
320
330
  #
@@ -621,7 +631,7 @@ Options:
621
631
  end
622
632
  end
623
633
  end
624
- end # if !$nosmooth
634
+ end
625
635
  end
626
636
 
627
637
  # print out default header
@@ -660,7 +670,13 @@ HEADER
660
670
 
661
671
  # calculate amino acid frequencies and mutabilities, and
662
672
  # print them as default statistics in the header part
663
- ala_factor = 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
673
+ ala_factor = if $aa_tot_obs["A"] == 0
674
+ 0.0
675
+ elsif $aa_mut_obs["A"] == 0
676
+ 0.0
677
+ else
678
+ 100.0 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
679
+ end
664
680
  $tot_aa = $aa_tot_obs.values.sum
665
681
 
666
682
  $outfh.puts "#"
@@ -668,9 +684,9 @@ HEADER
668
684
  $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
669
685
 
670
686
  $amino_acids.each do |res|
671
- $aa_mutb[res] = $aa_mut_obs[res] / $aa_tot_obs[res].to_f
687
+ $aa_mutb[res] = $aa_tot_obs[res] == 0 ? 1.0 : $aa_mut_obs[res] / $aa_tot_obs[res].to_f
672
688
  $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor
673
- $aa_rel_freq[res] = $aa_tot_obs[res] / $tot_aa.to_f
689
+ $aa_rel_freq[res] = $aa_tot_obs[res] == 0 ? 0.0 : $aa_tot_obs[res] / $tot_aa.to_f
674
690
  end
675
691
 
676
692
  $amino_acids.each do |res|
@@ -682,6 +698,9 @@ HEADER
682
698
  [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
683
699
  end
684
700
  end
701
+ #
702
+ # Part 4. END
703
+ #
685
704
 
686
705
 
687
706
  # Part 5.
@@ -728,6 +747,9 @@ HEADER
728
747
  $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
729
748
  exit 0
730
749
  end
750
+ #
751
+ # Part 5. END
752
+ #
731
753
 
732
754
 
733
755
  # Part 6.
@@ -746,6 +768,7 @@ HEADER
746
768
  HEADER
747
769
  end
748
770
 
771
+ # when nosmoothing !!!
749
772
  if ($output > 0) && $nosmooth
750
773
  # Probability matrices
751
774
  $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size)
@@ -781,7 +804,7 @@ HEADER
781
804
  end
782
805
  end
783
806
 
784
- # for smoothing...
807
+ # when smoothing!!!
785
808
  if ($output > 0) && !$nosmooth
786
809
  #
787
810
  # p1 probability
@@ -905,7 +928,13 @@ HEADER
905
928
 
906
929
  # entropy based weighting priors
907
930
  entropy_max = Math::log($amino_acids.size)
908
- entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| p == 0.0 ? s - 1 : s + p * Math::log(p) } }
931
+ entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p|
932
+ begin
933
+ p == 0.0 ? s - 1 : s + p * Math::log(p)
934
+ rescue
935
+ #puts "P: #{p}"
936
+ end
937
+ } }
909
938
  mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max }
910
939
  weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum }
911
940
  weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum
@@ -1058,96 +1087,101 @@ HEADER
1058
1087
  $outfh.close
1059
1088
  exit 0
1060
1089
  end
1090
+ end
1091
+ #
1092
+ # Part 6. END
1093
+ #
1061
1094
 
1062
1095
 
1063
- # Part 7.
1064
- #
1065
- # Calculating log odds ratio scoring matrices
1066
- #
1067
- if $output == 2
1068
- $outfh.puts <<HEADER
1096
+ # Part 7.
1097
+ #
1098
+ # Calculating log odds ratio scoring matrices
1099
+ #
1100
+ if $output == 2
1101
+ $outfh.puts <<HEADER
1069
1102
  #
1070
1103
  # The probabilities were then divided by the background probabilities
1071
1104
  HEADER
1072
- if $penv
1073
- $outfh.puts <<HEADER
1105
+ if $penv
1106
+ $outfh.puts <<HEADER
1074
1107
  # which were derived from the environment-independent amino acid frequencies.
1075
1108
  # ^^^^^^^^^^^^^^^^^^^^^^^
1076
1109
  HEADER
1077
- else
1078
- $outfh.puts <<HEADER
1110
+ else
1111
+ $outfh.puts <<HEADER
1079
1112
  # which were derived from the environment-dependent amino acid frequencies.
1080
1113
  # ^^^^^^^^^^^^^^^^^^^^^
1081
1114
  HEADER
1082
- end
1115
+ end
1083
1116
 
1084
- $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1085
- grp_logo_mats = []
1086
- factor = $scale / Math::log(2)
1087
-
1088
- # grouping environments by its environment labels but amino acid label
1089
- env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1090
-
1091
- # sorting environments and build 21X21 substitution matrices
1092
- env_groups.to_a.sort_by { |env_group|
1093
- # a bit clumsy sorting here...
1094
- env_group[0].split("").map_with_index { |l, i|
1095
- $env_features[i + 1].labels.index(l)
1096
- }
1097
- }.each_with_index do |group, group_no|
1098
- # calculating 21X21 substitution probability matrix for each envrionment
1099
- grp_label = group[0]
1100
- grp_envs = group[1]
1101
- grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1102
-
1103
- $amino_acids.each_with_index do |aa, ai|
1104
- env = grp_envs.detect { |e| e.label.start_with?(aa) }
1105
- logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1106
-
1107
- env.smooth_prob_array.to_a.each_with_index do |prob, j|
1108
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1109
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1110
- logo_arr[j] = factor * Math::log(odds)
1111
- end
1117
+ $tot_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1118
+ grp_logo_mats = []
1119
+ factor = $scale / Math::log(2)
1120
+
1121
+ # grouping environments by their environment labels, excluding the leading amino acid label
1122
+ env_groups = $envs.values.group_by { |env| env.label[1..-1] }
1123
+
1124
+ # sorting environments and build 21X21 substitution matrices
1125
+ env_groups.to_a.sort_by { |env_group|
1126
+ # a bit clumsy sorting here...
1127
+ env_group[0].split("").map_with_index { |l, i|
1128
+ $env_features[i + 1].labels.index(l)
1129
+ }
1130
+ }.each_with_index do |group, group_no|
1131
+ # calculating substitution probability matrix for each environment
1132
+ grp_label = group[0]
1133
+ grp_envs = group[1]
1134
+ grp_logo_mat = $cys == 0 ? NMatrix.float($amino_acids.size, $amino_acids.size + 1) : NMatrix.float($amino_acids.size, $amino_acids.size)
1112
1135
 
1113
- 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1136
+ $amino_acids.each_with_index do |aa, ai|
1137
+ env = grp_envs.detect { |e| e.label.start_with?(aa) }
1138
+ logo_arr = $cys == 0 ? NArray.float($amino_acids.size + 1) : NArray.float($amino_acids.size)
1114
1139
 
1115
- # adding log odds ratio for "U" (J or C) when --cyc is 0
1116
- if $cys == 0
1117
- paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1118
- prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1119
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1120
- logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1121
- grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1122
- end
1140
+ env.send($nosmooth ? "prob_array" : "smooth_prob_array").to_a.each_with_index do |prob, j|
1141
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1142
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1143
+ logo_arr[j] = factor * Math::log(odds)
1123
1144
  end
1124
1145
 
1125
- $tot_logo_mat += grp_logo_mat
1126
- grp_logo_mats << [grp_label, grp_logo_mat]
1146
+ 0.upto($amino_acids.size - 1) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1147
+
1148
+ # adding log odds ratio for "U" (J or C) when --cys is 0
1149
+ if $cys == 0
1150
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1151
+ prob = env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("C")] +
1152
+ env.send($nosmooth ? "prob_array" : "smooth_prob_array")[$amino_acids.index("J")]
1153
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1154
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1155
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1156
+ end
1127
1157
  end
1128
1158
 
1129
- $tot_logo_mat /= env_groups.size
1130
-
1131
- # calculating relative entropy for each amino acid pair H and
1132
- # the expected score E in bit units
1133
- #
1134
- # I'm a bit suspicious about this part...
1135
- tot_E = 0.0
1136
- tot_H = 0.0
1137
-
1138
- 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1139
- 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1140
- if i != j
1141
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1142
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1143
- else
1144
- tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1145
- tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1146
- end
1159
+ $tot_logo_mat += grp_logo_mat
1160
+ grp_logo_mats << [grp_label, grp_logo_mat]
1161
+ end
1162
+
1163
+ $tot_logo_mat /= env_groups.size
1164
+
1165
+ # calculating relative entropy for each amino acid pair H and
1166
+ # the expected score E in bit units
1167
+ #
1168
+ # I'm a bit suspicious about this part...
1169
+ tot_E = 0.0
1170
+ tot_H = 0.0
1171
+
1172
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1173
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1174
+ if i != j
1175
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1176
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1177
+ else
1178
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1179
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1147
1180
  end
1148
1181
  end
1182
+ end
1149
1183
 
1150
- $outfh.puts <<HEADER
1184
+ $outfh.puts <<HEADER
1151
1185
  #
1152
1186
  # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1153
1187
  # rounded to the nearest integer (log-odds scores in 1/3 bit units).
@@ -1156,31 +1190,30 @@ HEADER
1156
1190
  #
1157
1191
  HEADER
1158
1192
 
1159
- grp_logo_mats.each_with_index do |arr, grp_no|
1160
- grp_label = arr[0]
1161
- grp_logo_mat = arr[1]
1193
+ grp_logo_mats.each_with_index do |arr, grp_no|
1194
+ grp_label = arr[0]
1195
+ grp_logo_mat = arr[1]
1162
1196
 
1163
- $outfh.puts ">#{grp_label} #{grp_no}"
1164
- if $cys
1165
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1166
- else
1167
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1168
- end
1197
+ $outfh.puts ">#{grp_label} #{grp_no}"
1198
+ if $cys
1199
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1200
+ else
1201
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1169
1202
  end
1203
+ end
1170
1204
 
1171
- $outfh.puts ">Total #{grp_logo_mats.size}"
1205
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1172
1206
 
1173
- if $cys == 0
1174
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1175
- else
1176
- $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1177
- end
1178
- $outfh.close
1179
- exit 0
1207
+ if $cys == 0
1208
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1209
+ else
1210
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1180
1211
  end
1212
+ $outfh.close
1213
+ exit 0
1181
1214
  end
1182
1215
  end
1216
+ end
1183
1217
 
1184
- end # class << self
1185
1218
  end # class CLI
1186
1219
  end # module Egor
data/lib/egor.rb CHANGED
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.3'
5
+ VERSION = '0.0.4'
6
6
  end
data/website/index.html CHANGED
@@ -34,7 +34,7 @@
34
34
  <div class="sidebar">
35
35
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/egor"; return false'>
36
36
  <p>Get Version</p>
37
- <a href="http://rubyforge.org/projects/egor" class="numbers">0.0.3</a>
37
+ <a href="http://rubyforge.org/projects/egor" class="numbers">0.0.4</a>
38
38
  </div>
39
39
  </div>
40
40
  <h2>What</h2>
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Semin Lee
@@ -30,7 +30,7 @@ cert_chain:
30
30
  35w+y1Jd
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-12-09 00:00:00 +00:00
33
+ date: 2008-12-15 00:00:00 +00:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -81,7 +81,7 @@ dependencies:
81
81
  requirements:
82
82
  - - ">="
83
83
  - !ruby/object:Gem::Version
84
- version: 1.1.0
84
+ version: 1.2.1
85
85
  version:
86
86
  - !ruby/object:Gem::Dependency
87
87
  name: hoe
metadata.gz.sig CHANGED
Binary file