egor 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.2 2008-11-13
2
+
3
+ * 2 major enhancements:
4
+ * Constrained environment features are properly handled
5
+ * Sane log-odds ratio matrices are produced
6
+
1
7
  == 0.0.1 2008-11-07
2
8
 
3
9
  * 1 major enhancement:
data/README.rdoc CHANGED
@@ -9,8 +9,10 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
9
9
  == FEATURES/PROBLEMS:
10
10
 
11
11
  * No more segmentation fault
12
+ * Fast enough not to leave your place
13
+ * Slow enough to check your emails or have some chats with your colleagues next to you
12
14
  * Full smoothing supported
13
- * Infinite number of environment features can be handled (in theory)
15
+ * In theory, infinite number of environment features can be handled
14
16
 
15
17
  == BASIC USAGE:
16
18
 
data/lib/egor/cli.rb CHANGED
@@ -41,19 +41,19 @@ Usage:
41
41
  egor [ options ] -f TEM-file -c CLASSDEF-file
42
42
 
43
43
  Options:
44
- --tem-file (-f) STRING: a tem file
45
- --tem-list (-l) STRING: a list for tem files
46
- --classdef (-c) STRING: a file for the defintion of environments (default: 'classdef.dat')
47
- --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
44
+ --tem-file (-f) FILE: a tem file
45
+ --tem-list (-l) FILE: a list for tem files
46
+ --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
47
+ --outfile (-o) FILE: output filename ("allmat.dat" if not specified)
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
51
51
  0 for parial smoothing (default)
52
52
  1 for full smoothing
53
53
  --nosmooth: perform no smoothing operation
54
- --cys (-y) INTEGER: (NOT implemented yet)
55
- 0 for using C and J only for structure
56
- 1 for both structure and sequence (default)
54
+ --cys (-y) INTEGER:
55
+ 0 for using C and J only for structure (default)
56
+ 1 for both structure and sequence
57
57
  --output INTEGER:
58
58
  0 for raw counts (no-smoothing performed)
59
59
  1 for probabilities
@@ -61,7 +61,7 @@ Options:
61
61
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
62
62
  --sigma DOUBLE: change the sigma value for smoothing (default 5)
63
63
  --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
64
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
64
+ --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
65
65
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
66
66
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
67
67
  --verbose (-v) INTEGER
@@ -137,12 +137,6 @@ Options:
137
137
  $outfile = "allmat.dat"
138
138
  $outfh = nil # file handle for outfile
139
139
  $output = 2
140
- $aa_tot_obs = {}
141
- $aa_mut_obs = {}
142
- $aa_mutb = {}
143
- $aa_rel_mutb = {}
144
- $aa_rel_freq = {}
145
- $env_aa_obs = {}
146
140
  $ali_size = 0
147
141
  $tot_aa = 0
148
142
  $sigma = 5.0
@@ -154,10 +148,20 @@ Options:
154
148
  $pidmin = nil
155
149
  $pidmax = nil
156
150
  $scale = 3
157
- $add = 0
151
+ $add = nil
152
+ $cys = 0
158
153
  $penv = false
159
- $heatmap = false
154
+
155
+ $aa_tot_obs = {}
156
+ $aa_mut_obs = {}
157
+ $aa_mutb = {}
158
+ $aa_rel_mutb = {}
159
+ $aa_rel_freq = {}
160
+ $env_aa_obs = {}
160
161
  $smooth_prob = {}
162
+ $tot_freq_mat = nil
163
+ $tot_prob_mat = nil
164
+ $tot_logo_mat = nil
161
165
 
162
166
  # Part 2.
163
167
  #
@@ -195,10 +199,8 @@ Options:
195
199
  $output = arg.to_i
196
200
  when '--outfile'
197
201
  $outfile = arg
198
- when '--cyc'
199
- $logger.error "!!! --cys option is not available yet"
200
- exit 1
201
- $cysteine = (arg.to_i == 1 ? false : true)
202
+ when '--cys'
203
+ $cys = (arg.to_i == 1 ? false : true)
202
204
  when '--weight'
203
205
  $weight = arg.to_i
204
206
  when '--sigma'
@@ -210,15 +212,17 @@ Options:
210
212
  when '--noweight'
211
213
  $noweight = true
212
214
  when '--smooth'
213
- $smooth = (arg.to_i == 1 ? :full : :parital)
215
+ $smooth = (arg.to_i == 1 ? :full : :partial)
214
216
  when '--nosmooth'
215
217
  $nosmooth = true
216
218
  when '--scale'
217
219
  $scale = arg.to_f
218
220
  when '--add'
221
+ $logger.error "!!! --add option is not supported yet"
222
+ exit 1
219
223
  $add = arg.to_f
220
224
  when '--penv'
221
- $logger.error "!!! --penv option is not available yet"
225
+ $logger.error "!!! --penv option is not supported yet"
222
226
  exit 1
223
227
  $penv = true
224
228
  when '--heatmap'
@@ -245,14 +249,19 @@ Options:
245
249
  exit 1
246
250
  end
247
251
 
252
+
248
253
  # Part 3.
249
254
  #
250
255
  # Reading Environment Class Definition File
251
256
  #
252
257
 
253
- # a hash for storing all environment feature objects
258
+ # an array for storing all environment feature objects
254
259
  $env_features = []
255
260
 
261
+
262
+ # an array for storing indexes of constrained environment features
263
+ $cst_features = []
264
+
256
265
  # aa1 amino acid in a substitution itself is an environment feature
257
266
  $env_features << EnvironmentFeature.new("sequence",
258
267
  $amino_acids,
@@ -262,24 +271,29 @@ Options:
262
271
 
263
272
  # read environment class definition file and
264
273
  # store them into the array prepared above
274
+ env_index = 1
275
+
265
276
  IO.foreach($classdef) do |line|
277
+ line.chomp!
266
278
  if line.start_with?("#")
267
279
  next
268
280
  elsif (env_ftr = line.chomp.split(/;/)).length == 5
269
- $logger.info ">>> An environment feature, #{line.chomp} detected"
281
+ $logger.info ">>> An environment feature, #{line} detected"
270
282
  if env_ftr[-1] == "T"
271
283
  # skip silenced environment feature
272
- $logger.warn "!!! The environment feature, #{line.chomp} silent"
284
+ $logger.warn "!!! The environment feature, #{line} silent"
273
285
  next
274
286
  end
275
287
  if env_ftr[-2] == "T"
276
- $logger.warn "!!! The environment feature, #{line.chomp} constrained"
288
+ $cst_features << env_index
289
+ $logger.warn "!!! The environment feature, #{line} constrained"
277
290
  end
278
291
  $env_features << EnvironmentFeature.new(env_ftr[0],
279
292
  env_ftr[1].split(""),
280
293
  env_ftr[2].split(""),
281
294
  env_ftr[3],
282
295
  env_ftr[4])
296
+ env_index += 1
283
297
  else
284
298
  $logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
285
299
  exit 1
@@ -325,7 +339,7 @@ Options:
325
339
  end
326
340
  end
327
341
 
328
- $ali_size += ali.size
342
+ $ali_size += 1
329
343
  env_labels = {}
330
344
  disulphide = {}
331
345
 
@@ -398,19 +412,27 @@ Options:
398
412
  aa2 = s2[pos].upcase
399
413
 
400
414
  if !$amino_acids.include?(aa1)
401
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
415
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
402
416
  next
403
417
  end
404
418
 
405
419
  if !$amino_acids.include?(aa2)
406
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
420
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
407
421
  next
408
422
  end
409
423
 
410
424
  aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
411
425
  aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
412
426
 
413
- $envs[env_labels[id1][pos]].add_residue_count(aa2)
427
+ if $cst_features.empty?
428
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
429
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
430
+ env_labels[id2][pos].split("").values_at(*$cst_features))
431
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
432
+ else
433
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
434
+ next
435
+ end
414
436
 
415
437
  grp_label = env_labels[id1][pos][1..-1]
416
438
 
@@ -485,6 +507,7 @@ Options:
485
507
  cluster2.each do |id2|
486
508
  seq1 = ali[id1].split("")
487
509
  seq2 = ali[id2].split("")
510
+
488
511
  seq1.each_with_index do |aa1, pos|
489
512
  if env_labels[id1][pos].include?("X")
490
513
  $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
@@ -511,8 +534,17 @@ Options:
511
534
  obs1 = 1.0 / size1
512
535
  obs2 = 1.0 / size2
513
536
 
514
- $envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
515
- $envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))
537
+ if $cst_features.empty?
538
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
539
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
540
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
541
+ env_labels[id2][pos].split("").values_at(*$cst_features))
542
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
543
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
544
+ else
545
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
546
+ next
547
+ end
516
548
 
517
549
  grp_label1 = env_labels[id1][pos][1..-1]
518
550
  grp_label2 = env_labels[id2][pos][1..-1]
@@ -605,7 +637,6 @@ HEADER
605
637
  else
606
638
  $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
607
639
  end
608
- $outfh.puts "#"
609
640
 
610
641
  # calculate amino acid frequencies and mutabilities, and
611
642
  # print them as default statistics in the header part
@@ -614,7 +645,7 @@ HEADER
614
645
 
615
646
  $outfh.puts "#"
616
647
  $outfh.puts "# Total amino acid frequencies:\n"
617
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]
648
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
618
649
 
619
650
  $aa_tot_obs.each_pair do |res, freq|
620
651
  $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
@@ -625,13 +656,18 @@ HEADER
625
656
  $amino_acids.each do |res|
626
657
  if $noweight
627
658
  $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
628
- [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
659
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
629
660
  else
630
661
  $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
631
- [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
662
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
632
663
  end
633
664
  end
634
- $outfh.puts "#"
665
+
666
+
667
+ # Part 5.
668
+ #
669
+ # Calculating substitution frequency tables
670
+ #
635
671
 
636
672
  # calculating probabilities for each environment
637
673
  $envs.values.each do |e|
@@ -641,7 +677,7 @@ HEADER
641
677
  end
642
678
 
643
679
  # count raw frequencies
644
- $tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
680
+ $tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
645
681
 
646
682
  # for each combination of environment features
647
683
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -652,28 +688,33 @@ HEADER
652
688
  $env_features[i + 1].labels.index(l)
653
689
  }
654
690
  }.each_with_index do |group, group_no|
655
- grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
691
+ grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
656
692
 
657
693
  $amino_acids.each_with_index do |aa, ai|
658
694
  freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
659
- 0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
695
+ 0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
660
696
  end
661
697
 
662
- $tot_freq_matrix += grp_freq_matrix
698
+ $tot_freq_mat += grp_freq_mat
663
699
 
664
700
  if $output == 0
665
701
  $outfh.puts ">#{group[0]} #{group_no}"
666
- $outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
702
+ $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
667
703
  end
668
704
  end
669
705
 
670
706
  if $output == 0
671
707
  $outfh.puts ">Total"
672
- $outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
708
+ $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
673
709
  exit 0
674
710
  end
675
711
 
676
- # for probability
712
+
713
+ # Part 6.
714
+ #
715
+ # Calculating substitution probability tables
716
+ #
717
+
677
718
  if $output == 1
678
719
  $outfh.puts <<HEADER
679
720
  #
@@ -687,7 +728,7 @@ HEADER
687
728
 
688
729
  if ($output > 0) && $nosmooth
689
730
  # Probability matrices
690
- tot_prob_matrix = NMatrix.float(21, 21)
731
+ $tot_prob_mat = NMatrix.float(21, 21)
691
732
 
692
733
  # for each combination of environment features
693
734
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -697,24 +738,24 @@ HEADER
697
738
  $env_features[i + 1].labels.index(l)
698
739
  }
699
740
  }.each_with_index do |group, group_no|
700
- grp_prob_matrix = NMatrix.float(21,21)
741
+ grp_prob_mat = NMatrix.float(21,21)
701
742
 
702
743
  $amino_acids.each_with_index do |aa, ai|
703
744
  prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
704
- 0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
745
+ 0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
705
746
  end
706
747
 
707
- tot_prob_matrix += grp_prob_matrix
748
+ $tot_prob_mat += grp_prob_mat
708
749
 
709
750
  if ($output == 1)
710
751
  $outfh.puts ">#{group[0]} #{group_no}"
711
- $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
752
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
712
753
  end
713
754
  end
714
755
 
715
756
  if ($output == 1)
716
757
  $outfh.puts ">Total"
717
- $outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
758
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
718
759
  $outfh.close
719
760
  exit 0
720
761
  end
@@ -749,6 +790,7 @@ HEADER
749
790
 
750
791
  if $smooth == :partial
751
792
  $outfh.puts <<HEADER
793
+ #
752
794
  # Partial Smoothing:
753
795
  #
754
796
  # p1(ri) (i.e., amino acid composition) is estimated by summing over
@@ -771,7 +813,6 @@ HEADER
771
813
  # Weights (omegas) are calculated as in Topham et al. 1993)
772
814
  #
773
815
  # sigma value used is: 5.00
774
- #
775
816
  HEADER
776
817
  1.upto($env_features.size) do |ci|
777
818
  # for partial smoothing, only P1 ~ P3, and Pn are considered
@@ -872,6 +913,7 @@ HEADER
872
913
  end
873
914
  else
874
915
  $outfh.puts <<HEADER
916
+ #
875
917
  # Full Smoothing:
876
918
  #
877
919
  # p1(ri) is estimated as:
@@ -897,7 +939,6 @@ HEADER
897
939
  # Weights (omegas) are calculated as in Topham et al. 1993)
898
940
  #
899
941
  # sigma value used is: 5.00
900
- #
901
942
  HEADER
902
943
  # full smoothing
903
944
  1.upto($env_features.size) do |ci|
@@ -959,7 +1000,7 @@ HEADER
959
1000
  $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
960
1001
 
961
1002
  # for a total substitution probability matrix
962
- tot_smooth_prob_matrix = NMatrix.float(21,21)
1003
+ $tot_prob_mat = NMatrix.float(21,21)
963
1004
 
964
1005
  # grouping environments by its environment labels but amino acid label
965
1006
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -972,46 +1013,54 @@ HEADER
972
1013
  }
973
1014
  }.each_with_index do |group, group_no|
974
1015
  # calculating 21X21 substitution probability matrix for each environment
975
- grp_prob_matrix = NMatrix.float(21,21)
1016
+ grp_prob_mat = NMatrix.float(21,21)
976
1017
 
977
1018
  $amino_acids.each_with_index do |aa, ai|
978
1019
  smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
979
- 0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
1020
+ 0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
980
1021
  end
981
1022
 
982
- tot_smooth_prob_matrix += grp_prob_matrix
1023
+ $tot_prob_mat += grp_prob_mat
983
1024
 
984
1025
  if $output == 1
985
1026
  $outfh.puts ">#{group[0]} #{group_no}"
986
- $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1027
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
987
1028
  end
988
1029
  end
989
1030
 
990
- tot_smooth_prob_matrix /= env_groups.size
1031
+ $tot_prob_mat /= env_groups.size
991
1032
 
992
1033
  if $output == 1
993
1034
  $outfh.puts ">Total"
994
- $outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1035
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
995
1036
  $outfh.close
996
1037
  exit 0
997
1038
  end
998
1039
 
1040
+
1041
+ # Part 7.
1042
+ #
1043
+ # Calculating log-add ratio scoring matrices
1044
+ #
999
1045
  if $output == 2
1000
1046
  $outfh.puts <<HEADER
1001
1047
  #
1002
1048
  # The probabilities were then divided by the background probabilities
1049
+ HEADER
1050
+ if $penv
1051
+ $outfh.puts <<HEADER
1003
1052
  # which were derived from the environment-independent amino acid frequencies.
1004
1053
  # ^^^^^^^^^^^^^^^^^^^^^^^
1005
- #
1006
- # Shown here are logarithms of these values multiplied by 3/log(2)
1007
- # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1008
- #
1009
- # For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
1010
- #
1011
1054
  HEADER
1055
+ else
1056
+ $outfh.puts <<HEADER
1057
+ # which were derived from the environment-dependent amino acid frequencies.
1058
+ # ^^^^^^^^^^^^^^^^^^^^^
1059
+ HEADER
1060
+ end
1012
1061
 
1013
- # log-add ratio matrices from now on
1014
- tot_logo_mat = NMatrix.float(21,21)
1062
+ $tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
1063
+ grp_logo_mats = []
1015
1064
  factor = $scale / Math::log(2)
1016
1065
 
1017
1066
  # grouping environments by its environment labels but amino acid label
@@ -1027,30 +1076,82 @@ HEADER
1027
1076
  # calculating 21X21 substitution probability matrix for each environment
1028
1077
  grp_label = group[0]
1029
1078
  grp_envs = group[1]
1030
- grp_logo_mat = NMatrix.float(21,21)
1079
+ grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
1031
1080
 
1032
1081
  $amino_acids.each_with_index do |aa, ai|
1033
1082
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1034
- logo_arr = NArray.float(21)
1083
+ logo_arr = $cys ? NArray.float(22) : NArray.float(21)
1035
1084
 
1036
1085
  env.smooth_prob_array.to_a.each_with_index do |prob, j|
1037
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1038
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1086
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1087
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1039
1088
  logo_arr[j] = factor * Math::log(odds)
1040
1089
  end
1090
+
1041
1091
  0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1092
+
1093
+ # adding log odds ratio for "U" (J or C) when --cys is ON
1094
+ if $cys
1095
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1096
+ prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1097
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1098
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1099
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1100
+ end
1042
1101
  end
1043
1102
 
1044
- tot_logo_mat += grp_logo_mat
1103
+ $tot_logo_mat += grp_logo_mat
1104
+ grp_logo_mats << [grp_label, grp_logo_mat]
1105
+ end
1045
1106
 
1046
- $outfh.puts ">#{grp_label} #{group_no}"
1047
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1107
+ $tot_logo_mat /= env_groups.size
1108
+
1109
+ # calculating relative entropy for each amino acid pair H and
1110
+ # the expected score E in bit units
1111
+ #
1112
+ # I'm a bit suspicious about this part...
1113
+ tot_E = 0.0
1114
+ tot_H = 0.0
1115
+
1116
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1117
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1118
+ if i != j
1119
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1120
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1121
+ else
1122
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1123
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1124
+ end
1125
+ end
1048
1126
  end
1049
1127
 
1050
- tot_logo_mat /= env_groups.size
1128
+ $outfh.puts <<HEADER
1129
+ #
1130
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1131
+ # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1132
+ #
1133
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1134
+ #
1135
+ HEADER
1136
+
1137
+ grp_logo_mats.each_with_index do |arr, grp_no|
1138
+ grp_label = arr[0]
1139
+ grp_logo_mat = arr[1]
1051
1140
 
1052
- $outfh.puts ">Total"
1053
- $outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1141
+ $outfh.puts ">#{grp_label} #{grp_no}"
1142
+ if $cys
1143
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1144
+ else
1145
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1146
+ end
1147
+ end
1148
+
1149
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1150
+ if $cys
1151
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1152
+ else
1153
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1154
+ end
1054
1155
  $outfh.close
1055
1156
  exit 0
1056
1157
  end
data/lib/egor.rb CHANGED
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.1'
6
- end
5
+ VERSION = '0.0.2'
6
+ end
data/lib/environment.rb CHANGED
@@ -23,7 +23,7 @@ class Environment
23
23
  @smooth_prob_array = NArray.float(21)
24
24
  end
25
25
 
26
- def add_residue_count(a, inc = 1.0)
26
+ def increase_residue_count(a, inc = 1.0)
27
27
  @freq_array[@@amino_acids.index(a.upcase)] += inc
28
28
  end
29
29
 
@@ -1,7 +1,7 @@
1
1
  class EnvironmentFeature < Struct.new(:name, :symbols, :labels, :constrained, :silent)
2
2
 
3
3
  def to_s
4
- values.join(";")
4
+ [name, symbols.join, labels.join, constrained, silent].join(";")
5
5
  end
6
6
 
7
7
  def constrained?
data/website/index.html CHANGED
@@ -44,6 +44,8 @@
44
44
  <h2>Features</h2>
45
45
  <ul>
46
46
  <li>No more segmentation fault</li>
47
+ <li>Fast enough not to leave your place</li>
48
+ <li>Slow enough to check your emails or have some chats with your colleagues next to you</li>
47
49
  <li>Full smoothing supported</li>
48
50
  <li>In theory, infinite number of environment features can be handled</li>
49
51
  </ul>
@@ -53,7 +55,7 @@
53
55
  or
54
56
  <pre>$ egor -l TEM-file -c classdef.dat</pre>
55
57
  <h2>Repository</h2>
56
- <p>You can download a pre-built RubyGem package from</p>
58
+ <p>You can download a pre-built RubyGems package from</p>
57
59
  <ul>
58
60
  <li>rubyforge: <a href="http://rubyforge.org/projects/egor">http://rubyforge.org/projects/egor</a></li>
59
61
  </ul>
@@ -67,12 +69,19 @@ or
67
69
  <h2>Contact</h2>
68
70
  <p>Comments are welcome, please send an email to me (seminlee at gmail dot com).</p>
69
71
  <p class="coda">
70
- <a href="FIXME email">Semin Lee</a>, 10th November 2008<br>
72
+ Semin Lee, 12th November 2008<br>
71
73
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
72
74
  </p>
73
75
  </div>
74
76
 
75
77
  <!-- insert site tracking codes here, like Google Urchin -->
76
-
78
+ <script type="text/javascript">
79
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
80
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
81
+ </script>
82
+ <script type="text/javascript">
83
+ var pageTracker = _gat._getTracker("UA-6291956-1");
84
+ pageTracker._trackPageview();
85
+ </script>
77
86
  </body>
78
87
  </html>
data/website/index.txt CHANGED
@@ -14,6 +14,8 @@ h2. Installation
14
14
  h2. Features
15
15
 
16
16
  * No more segmentation fault
17
+ * Fast enough not to leave your place
18
+ * Slow enough to check your emails or have some chats with your colleagues next to you
17
19
  * Full smoothing supported
18
20
  * In theory, infinite number of environment features can be handled
19
21
 
@@ -29,7 +31,7 @@ It's pretty much the same as Kenji's subst, so in most cases, you just need swap
29
31
 
30
32
  h2. Repository
31
33
 
32
- You can download a pre-built RubyGem package from
34
+ You can download a pre-built RubyGems package from
33
35
 
34
36
  * rubyforge: "http://rubyforge.org/projects/egor":http://rubyforge.org/projects/egor
35
37
 
@@ -39,12 +39,19 @@
39
39
  </div>
40
40
  <%= body %>
41
41
  <p class="coda">
42
- <a href="FIXME email">Semin Lee</a>, <%= modified.pretty %><br>
42
+ Semin Lee, <%= modified.pretty %><br>
43
43
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
44
44
  </p>
45
45
  </div>
46
46
 
47
47
  <!-- insert site tracking codes here, like Google Urchin -->
48
-
48
+ <script type="text/javascript">
49
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
50
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
51
+ </script>
52
+ <script type="text/javascript">
53
+ var pageTracker = _gat._getTracker("UA-6291956-1");
54
+ pageTracker._trackPageview();
55
+ </script>
49
56
  </body>
50
57
  </html>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Semin Lee
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-11-10 00:00:00 +00:00
12
+ date: 2008-11-13 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -60,7 +60,7 @@ dependencies:
60
60
  requirements:
61
61
  - - ">="
62
62
  - !ruby/object:Gem::Version
63
- version: 1.0.7
63
+ version: 1.1.0
64
64
  version:
65
65
  - !ruby/object:Gem::Dependency
66
66
  name: hoe