egor 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.0.2 2008-11-13
2
+
3
+ * 2 major enhancements:
4
+ * Constrained environment features are properly handled
5
+ * Sane log-odds ratio matrices are produced
6
+
1
7
  == 0.0.1 2008-11-07
2
8
 
3
9
  * 1 major enhancement:
data/README.rdoc CHANGED
@@ -9,8 +9,10 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
9
9
  == FEATURES/PROBLEMS:
10
10
 
11
11
  * No more segmentation fault
12
+ * Fast enough not to leave your place
13
+ * Slow enough to check your emails or have some chats with your colleagues next to you
12
14
  * Full smoothing supported
13
- * Infinite number of environment features can be handled (in theory)
15
+ * In theory, infinite number of environment features can be handled
14
16
 
15
17
  == BASIC USAGE:
16
18
 
data/lib/egor/cli.rb CHANGED
@@ -41,19 +41,19 @@ Usage:
41
41
  egor [ options ] -f TEM-file -c CLASSDEF-file
42
42
 
43
43
  Options:
44
- --tem-file (-f) STRING: a tem file
45
- --tem-list (-l) STRING: a list for tem files
46
- --classdef (-c) STRING: a file for the defintion of environments (default: 'classdef.dat')
47
- --outfile (-o) STRING: output filename ("allmat.dat" if not specified)
44
+ --tem-file (-f) FILE: a tem file
45
+ --tem-list (-l) FILE: a list for tem files
46
+ --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
47
+ --outfile (-o) FILE: output filename ("allmat.dat" if not specified)
48
48
  --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
49
49
  --noweight: calculate substitution counts with no weights (default)
50
50
  --smooth (-s) INTEGER:
51
51
  0 for parial smoothing (default)
52
52
  1 for full smoothing
53
53
  --nosmooth: perform no smoothing operation
54
- --cys (-y) INTEGER: (NOT implemented yet)
55
- 0 for using C and J only for structure
56
- 1 for both structure and sequence (default)
54
+ --cys (-y) INTEGER:
55
+ 0 for using C and J only for structure (default)
56
+ 1 for both structure and sequence
57
57
  --output INTEGER:
58
58
  0 for raw counts (no-smoothing performed)
59
59
  1 for probabilities
@@ -61,7 +61,7 @@ Options:
61
61
  --scale INTEGER: log-odds matrices in 1/n bit units (default 3)
62
62
  --sigma DOUBLE: change the sigma value for smoothing (default 5)
63
63
  --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
64
- --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
64
+ --penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
65
65
  --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
66
66
  --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
67
67
  --verbose (-v) INTEGER
@@ -137,12 +137,6 @@ Options:
137
137
  $outfile = "allmat.dat"
138
138
  $outfh = nil # file hanfle for outfile
139
139
  $output = 2
140
- $aa_tot_obs = {}
141
- $aa_mut_obs = {}
142
- $aa_mutb = {}
143
- $aa_rel_mutb = {}
144
- $aa_rel_freq = {}
145
- $env_aa_obs = {}
146
140
  $ali_size = 0
147
141
  $tot_aa = 0
148
142
  $sigma = 5.0
@@ -154,10 +148,20 @@ Options:
154
148
  $pidmin = nil
155
149
  $pidmax = nil
156
150
  $scale = 3
157
- $add = 0
151
+ $add = nil
152
+ $cys = 0
158
153
  $penv = false
159
- $heatmap = false
154
+
155
+ $aa_tot_obs = {}
156
+ $aa_mut_obs = {}
157
+ $aa_mutb = {}
158
+ $aa_rel_mutb = {}
159
+ $aa_rel_freq = {}
160
+ $env_aa_obs = {}
160
161
  $smooth_prob = {}
162
+ $tot_freq_mat = nil
163
+ $tot_prob_mat = nil
164
+ $tot_logo_mat = nil
161
165
 
162
166
  # Part 2.
163
167
  #
@@ -195,10 +199,8 @@ Options:
195
199
  $output = arg.to_i
196
200
  when '--outfile'
197
201
  $outfile = arg
198
- when '--cyc'
199
- $logger.error "!!! --cys option is not available yet"
200
- exit 1
201
- $cysteine = (arg.to_i == 1 ? false : true)
202
+ when '--cys'
203
+ $cys = (arg.to_i == 1 ? false : true)
202
204
  when '--weight'
203
205
  $weight = arg.to_i
204
206
  when '--sigma'
@@ -210,15 +212,17 @@ Options:
210
212
  when '--noweight'
211
213
  $noweight = true
212
214
  when '--smooth'
213
- $smooth = (arg.to_i == 1 ? :full : :parital)
215
+ $smooth = (arg.to_i == 1 ? :full : :partial)
214
216
  when '--nosmooth'
215
217
  $nosmooth = true
216
218
  when '--scale'
217
219
  $scale = arg.to_f
218
220
  when '--add'
221
+ $logger.error "!!! --add option is not supported yet"
222
+ exit 1
219
223
  $add = arg.to_f
220
224
  when '--penv'
221
- $logger.error "!!! --penv option is not available yet"
225
+ $logger.error "!!! --penv option is not supported yet"
222
226
  exit 1
223
227
  $penv = true
224
228
  when '--heatmap'
@@ -245,14 +249,19 @@ Options:
245
249
  exit 1
246
250
  end
247
251
 
252
+
248
253
  # Part 3.
249
254
  #
250
255
  # Reading Environment Class Definition File
251
256
  #
252
257
 
253
- # a hash for storing all environment feature objects
258
+ # an array for storing all environment feature objects
254
259
  $env_features = []
255
260
 
261
+
262
+ # an array for storing indexes of constrained environment features
263
+ $cst_features = []
264
+
256
265
  # aa1 amino acid in a substitution itself is a environment feature
257
266
  $env_features << EnvironmentFeature.new("sequence",
258
267
  $amino_acids,
@@ -262,24 +271,29 @@ Options:
262
271
 
263
272
  # read environment class definition file and
264
273
  # store them into the hash prepared above
274
+ env_index = 1
275
+
265
276
  IO.foreach($classdef) do |line|
277
+ line.chomp!
266
278
  if line.start_with?("#")
267
279
  next
268
280
  elsif (env_ftr = line.chomp.split(/;/)).length == 5
269
- $logger.info ">>> An environment feature, #{line.chomp} detected"
281
+ $logger.info ">>> An environment feature, #{line} detected"
270
282
  if env_ftr[-1] == "T"
271
283
  # skip silenced environment feature
272
- $logger.warn "!!! The environment feature, #{line.chomp} silent"
284
+ $logger.warn "!!! The environment feature, #{line} silent"
273
285
  next
274
286
  end
275
287
  if env_ftr[-2] == "T"
276
- $logger.warn "!!! The environment feature, #{line.chomp} constrained"
288
+ $cst_features << env_index
289
+ $logger.warn "!!! The environment feature, #{line} constrained"
277
290
  end
278
291
  $env_features << EnvironmentFeature.new(env_ftr[0],
279
292
  env_ftr[1].split(""),
280
293
  env_ftr[2].split(""),
281
294
  env_ftr[3],
282
295
  env_ftr[4])
296
+ env_index += 1
283
297
  else
284
298
  $logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
285
299
  exit 1
@@ -325,7 +339,7 @@ Options:
325
339
  end
326
340
  end
327
341
 
328
- $ali_size += ali.size
342
+ $ali_size += 1
329
343
  env_labels = {}
330
344
  disulphide = {}
331
345
 
@@ -398,19 +412,27 @@ Options:
398
412
  aa2 = s2[pos].upcase
399
413
 
400
414
  if !$amino_acids.include?(aa1)
401
- $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
415
+ $logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
402
416
  next
403
417
  end
404
418
 
405
419
  if !$amino_acids.include?(aa2)
406
- $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
420
+ $logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
407
421
  next
408
422
  end
409
423
 
410
424
  aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
411
425
  aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
412
426
 
413
- $envs[env_labels[id1][pos]].add_residue_count(aa2)
427
+ if $cst_features.empty?
428
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
429
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
430
+ env_labels[id2][pos].split("").values_at(*$cst_features))
431
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2)
432
+ else
433
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
434
+ next
435
+ end
414
436
 
415
437
  grp_label = env_labels[id1][pos][1..-1]
416
438
 
@@ -485,6 +507,7 @@ Options:
485
507
  cluster2.each do |id2|
486
508
  seq1 = ali[id1].split("")
487
509
  seq2 = ali[id2].split("")
510
+
488
511
  seq1.each_with_index do |aa1, pos|
489
512
  if env_labels[id1][pos].include?("X")
490
513
  $logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
@@ -511,8 +534,17 @@ Options:
511
534
  obs1 = 1.0 / size1
512
535
  obs2 = 1.0 / size2
513
536
 
514
- $envs[env_labels[id1][pos]].add_residue_count(aa2, 1.0 / (size1 * size2))
515
- $envs[env_labels[id2][pos]].add_residue_count(aa1, 1.0 / (size1 * size2))
537
+ if $cst_features.empty?
538
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
539
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
540
+ elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
541
+ env_labels[id2][pos].split("").values_at(*$cst_features))
542
+ $envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
543
+ $envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
544
+ else
545
+ $logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
546
+ next
547
+ end
516
548
 
517
549
  grp_label1 = env_labels[id1][pos][1..-1]
518
550
  grp_label2 = env_labels[id2][pos][1..-1]
@@ -605,7 +637,6 @@ HEADER
605
637
  else
606
638
  $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
607
639
  end
608
- $outfh.puts "#"
609
640
 
610
641
  # calculate amino acid frequencies and mutabilities, and
611
642
  # print them as default statistics in the header part
@@ -614,7 +645,7 @@ HEADER
614
645
 
615
646
  $outfh.puts "#"
616
647
  $outfh.puts "# Total amino acid frequencies:\n"
617
- $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUTB REL_MUTB REL_FRQ]
648
+ $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
618
649
 
619
650
  $aa_tot_obs.each_pair do |res, freq|
620
651
  $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
@@ -625,13 +656,18 @@ HEADER
625
656
  $amino_acids.each do |res|
626
657
  if $noweight
627
658
  $outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
628
- [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
659
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
629
660
  else
630
661
  $outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
631
- [res, $aa_mut_obs[res], $aa_tot_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
662
+ [res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
632
663
  end
633
664
  end
634
- $outfh.puts "#"
665
+
666
+
667
+ # Part 5.
668
+ #
669
+ # Calculating substitution frequency tables
670
+ #
635
671
 
636
672
  # calculating probabilities for each environment
637
673
  $envs.values.each do |e|
@@ -641,7 +677,7 @@ HEADER
641
677
  end
642
678
 
643
679
  # count raw frequencies
644
- $tot_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
680
+ $tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
645
681
 
646
682
  # for each combination of environment features
647
683
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -652,28 +688,33 @@ HEADER
652
688
  $env_features[i + 1].labels.index(l)
653
689
  }
654
690
  }.each_with_index do |group, group_no|
655
- grp_freq_matrix = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
691
+ grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
656
692
 
657
693
  $amino_acids.each_with_index do |aa, ai|
658
694
  freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
659
- 0.upto(20) { |j| grp_freq_matrix[ai, j] = freq_array[j] }
695
+ 0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
660
696
  end
661
697
 
662
- $tot_freq_matrix += grp_freq_matrix
698
+ $tot_freq_mat += grp_freq_mat
663
699
 
664
700
  if $output == 0
665
701
  $outfh.puts ">#{group[0]} #{group_no}"
666
- $outfh.puts grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
702
+ $outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
667
703
  end
668
704
  end
669
705
 
670
706
  if $output == 0
671
707
  $outfh.puts ">Total"
672
- $outfh.puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
708
+ $outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
673
709
  exit 0
674
710
  end
675
711
 
676
- # for probability
712
+
713
+ # Part 6.
714
+ #
715
+ # Calculating substitution probability tables
716
+ #
717
+
677
718
  if $output == 1
678
719
  $outfh.puts <<HEADER
679
720
  #
@@ -687,7 +728,7 @@ HEADER
687
728
 
688
729
  if ($output > 0) && $nosmooth
689
730
  # Probability matrices
690
- tot_prob_matrix = NMatrix.float(21, 21)
731
+ $tot_prob_mat = NMatrix.float(21, 21)
691
732
 
692
733
  # for each combination of environment features
693
734
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -697,24 +738,24 @@ HEADER
697
738
  $env_features[i + 1].labels.index(l)
698
739
  }
699
740
  }.each_with_index do |group, group_no|
700
- grp_prob_matrix = NMatrix.float(21,21)
741
+ grp_prob_mat = NMatrix.float(21,21)
701
742
 
702
743
  $amino_acids.each_with_index do |aa, ai|
703
744
  prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
704
- 0.upto(20) { |j| grp_prob_matrix[ai, j] = prob_array[j] }
745
+ 0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
705
746
  end
706
747
 
707
- tot_prob_matrix += grp_prob_matrix
748
+ $tot_prob_mat += grp_prob_mat
708
749
 
709
750
  if ($output == 1)
710
751
  $outfh.puts ">#{group[0]} #{group_no}"
711
- $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
752
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
712
753
  end
713
754
  end
714
755
 
715
756
  if ($output == 1)
716
757
  $outfh.puts ">Total"
717
- $outfh.puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
758
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
718
759
  $outfh.close
719
760
  exit 0
720
761
  end
@@ -749,6 +790,7 @@ HEADER
749
790
 
750
791
  if $smooth == :partial
751
792
  $outfh.puts <<HEADER
793
+ #
752
794
  # Partial Smoothing:
753
795
  #
754
796
  # p1(ri) (i.e., amino acid composition) is estimated by summing over
@@ -771,7 +813,6 @@ HEADER
771
813
  # Weights (omegas) are calculated as in Topham et al. 1993)
772
814
  #
773
815
  # sigma value used is: 5.00
774
- #
775
816
  HEADER
776
817
  1.upto($env_features.size) do |ci|
777
818
  # for partial smoothing, only P1 ~ P3, and Pn are considered
@@ -872,6 +913,7 @@ HEADER
872
913
  end
873
914
  else
874
915
  $outfh.puts <<HEADER
916
+ #
875
917
  # Full Smoothing:
876
918
  #
877
919
  # p1(ri) is estimated as:
@@ -897,7 +939,6 @@ HEADER
897
939
  # Weights (omegas) are calculated as in Topham et al. 1993)
898
940
  #
899
941
  # sigma value used is: 5.00
900
- #
901
942
  HEADER
902
943
  # full smoothing
903
944
  1.upto($env_features.size) do |ci|
@@ -959,7 +1000,7 @@ HEADER
959
1000
  $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
960
1001
 
961
1002
  # for a total substitution probability matrix
962
- tot_smooth_prob_matrix = NMatrix.float(21,21)
1003
+ $tot_prob_mat = NMatrix.float(21,21)
963
1004
 
964
1005
  # grouping environments by its environment labels but amino acid label
965
1006
  env_groups = $envs.values.group_by { |env| env.label[1..-1] }
@@ -972,46 +1013,54 @@ HEADER
972
1013
  }
973
1014
  }.each_with_index do |group, group_no|
974
1015
  # calculating 21X21 substitution probability matrix for each environment
975
- grp_prob_matrix = NMatrix.float(21,21)
1016
+ grp_prob_mat = NMatrix.float(21,21)
976
1017
 
977
1018
  $amino_acids.each_with_index do |aa, ai|
978
1019
  smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
979
- 0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
1020
+ 0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
980
1021
  end
981
1022
 
982
- tot_smooth_prob_matrix += grp_prob_matrix
1023
+ $tot_prob_mat += grp_prob_mat
983
1024
 
984
1025
  if $output == 1
985
1026
  $outfh.puts ">#{group[0]} #{group_no}"
986
- $outfh.puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1027
+ $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
987
1028
  end
988
1029
  end
989
1030
 
990
- tot_smooth_prob_matrix /= env_groups.size
1031
+ $tot_prob_mat /= env_groups.size
991
1032
 
992
1033
  if $output == 1
993
1034
  $outfh.puts ">Total"
994
- $outfh.puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1035
+ $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
995
1036
  $outfh.close
996
1037
  exit 0
997
1038
  end
998
1039
 
1040
+
1041
+ # Part 7.
1042
+ #
1043
+ # Calculating log-odds ratio scoring matrices
1044
+ #
999
1045
  if $output == 2
1000
1046
  $outfh.puts <<HEADER
1001
1047
  #
1002
1048
  # The probabilities were then divided by the background probabilities
1049
+ HEADER
1050
+ if $penv
1051
+ $outfh.puts <<HEADER
1003
1052
  # which were derived from the environment-independent amino acid frequencies.
1004
1053
  # ^^^^^^^^^^^^^^^^^^^^^^^
1005
- #
1006
- # Shown here are logarithms of these values multiplied by 3/log(2)
1007
- # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1008
- #
1009
- # For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
1010
- #
1011
1054
  HEADER
1055
+ else
1056
+ $outfh.puts <<HEADER
1057
+ # which were derived from the environment-dependent amino acid frequencies.
1058
+ # ^^^^^^^^^^^^^^^^^^^^^
1059
+ HEADER
1060
+ end
1012
1061
 
1013
- # log-add ratio matrices from now on
1014
- tot_logo_mat = NMatrix.float(21,21)
1062
+ $tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
1063
+ grp_logo_mats = []
1015
1064
  factor = $scale / Math::log(2)
1016
1065
 
1017
1066
  # grouping environments by its environment labels but amino acid label
@@ -1027,30 +1076,82 @@ HEADER
1027
1076
  # calculating 21X21 substitution probability matrix for each environment
1028
1077
  grp_label = group[0]
1029
1078
  grp_envs = group[1]
1030
- grp_logo_mat = NMatrix.float(21,21)
1079
+ grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
1031
1080
 
1032
1081
  $amino_acids.each_with_index do |aa, ai|
1033
1082
  env = grp_envs.detect { |e| e.label.start_with?(aa) }
1034
- logo_arr = NArray.float(21)
1083
+ logo_arr = $cys ? NArray.float(22) : NArray.float(21)
1035
1084
 
1036
1085
  env.smooth_prob_array.to_a.each_with_index do |prob, j|
1037
- paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1038
- odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1086
+ paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
1087
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1039
1088
  logo_arr[j] = factor * Math::log(odds)
1040
1089
  end
1090
+
1041
1091
  0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
1092
+
1093
+ # adding log odds ratio for "U" (J or C) when --cys is ON
1094
+ if $cys
1095
+ paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
1096
+ prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
1097
+ odds = prob == 0.0 ? 0.000001 / paj : prob / paj
1098
+ logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
1099
+ grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
1100
+ end
1042
1101
  end
1043
1102
 
1044
- tot_logo_mat += grp_logo_mat
1103
+ $tot_logo_mat += grp_logo_mat
1104
+ grp_logo_mats << [grp_label, grp_logo_mat]
1105
+ end
1045
1106
 
1046
- $outfh.puts ">#{grp_label} #{group_no}"
1047
- $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1107
+ $tot_logo_mat /= env_groups.size
1108
+
1109
+ # calculating relative entropy for each amino acid pair H and
1110
+ # the expected score E in bit units
1111
+ #
1112
+ # I'm a bit suspicious about this part...
1113
+ tot_E = 0.0
1114
+ tot_H = 0.0
1115
+
1116
+ 0.upto($tot_logo_mat.shape[0] - 1) do |i|
1117
+ 0.upto($tot_logo_mat.shape[0] - 1) do |j|
1118
+ if i != j
1119
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
1120
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
1121
+ else
1122
+ tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
1123
+ tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
1124
+ end
1125
+ end
1048
1126
  end
1049
1127
 
1050
- tot_logo_mat /= env_groups.size
1128
+ $outfh.puts <<HEADER
1129
+ #
1130
+ # Shown here are logarithms of these values multiplied by #{$scale}/log(2)
1131
+ # rounded to the nearest integer (log-odds scores in 1/3 bit units).
1132
+ #
1133
+ # For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
1134
+ #
1135
+ HEADER
1136
+
1137
+ grp_logo_mats.each_with_index do |arr, grp_no|
1138
+ grp_label = arr[0]
1139
+ grp_logo_mat = arr[1]
1051
1140
 
1052
- $outfh.puts ">Total"
1053
- $outfh.puts tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1141
+ $outfh.puts ">#{grp_label} #{grp_no}"
1142
+ if $cys
1143
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1144
+ else
1145
+ $outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1146
+ end
1147
+ end
1148
+
1149
+ $outfh.puts ">Total #{grp_logo_mats.size}"
1150
+ if $cys
1151
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
1152
+ else
1153
+ $outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
1154
+ end
1054
1155
  $outfh.close
1055
1156
  exit 0
1056
1157
  end
data/lib/egor.rb CHANGED
@@ -2,5 +2,5 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Egor
5
- VERSION = '0.0.1'
6
- end
5
+ VERSION = '0.0.2'
6
+ end
data/lib/environment.rb CHANGED
@@ -23,7 +23,7 @@ class Environment
23
23
  @smooth_prob_array = NArray.float(21)
24
24
  end
25
25
 
26
- def add_residue_count(a, inc = 1.0)
26
+ def increase_residue_count(a, inc = 1.0)
27
27
  @freq_array[@@amino_acids.index(a.upcase)] += inc
28
28
  end
29
29
 
@@ -1,7 +1,7 @@
1
1
  class EnvironmentFeature < Struct.new(:name, :symbols, :labels, :constrained, :silent)
2
2
 
3
3
  def to_s
4
- values.join(";")
4
+ [name, symbols.join, labels.join, constrained, silent].join(";")
5
5
  end
6
6
 
7
7
  def constrained?
data/website/index.html CHANGED
@@ -44,6 +44,8 @@
44
44
  <h2>Features</h2>
45
45
  <ul>
46
46
  <li>No more segmentation fault</li>
47
+ <li>Fast enough not to leave your place</li>
48
+ <li>Slow enough to check your emails or have some chats with your colleagues next to you</li>
47
49
  <li>Full smoothing supported</li>
48
50
  <li>In theory, infinite number of environment features can be handled</li>
49
51
  </ul>
@@ -53,7 +55,7 @@
53
55
  or
54
56
  <pre>$ egor -l TEM-file -c classdef.dat</pre>
55
57
  <h2>Repository</h2>
56
- <p>You can download a pre-built RubyGem package from</p>
58
+ <p>You can download a pre-built RubyGems package from</p>
57
59
  <ul>
58
60
  <li>rubyforge: <a href="http://rubyforge.org/projects/egor">http://rubyforge.org/projects/egor</a></li>
59
61
  </ul>
@@ -67,12 +69,19 @@ or
67
69
  <h2>Contact</h2>
68
70
  <p>Comments are welcome, please send an email to me (seminlee at gmail dot com).</p>
69
71
  <p class="coda">
70
- <a href="FIXME email">Semin Lee</a>, 10th November 2008<br>
72
+ Semin Lee, 12th November 2008<br>
71
73
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
72
74
  </p>
73
75
  </div>
74
76
 
75
77
  <!-- insert site tracking codes here, like Google Urchin -->
76
-
78
+ <script type="text/javascript">
79
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
80
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
81
+ </script>
82
+ <script type="text/javascript">
83
+ var pageTracker = _gat._getTracker("UA-6291956-1");
84
+ pageTracker._trackPageview();
85
+ </script>
77
86
  </body>
78
87
  </html>
data/website/index.txt CHANGED
@@ -14,6 +14,8 @@ h2. Installation
14
14
  h2. Features
15
15
 
16
16
  * No more segmentation fault
17
+ * Fast enough not to leave your place
18
+ * Slow enough to check your emails or have some chats with your colleagues next to you
17
19
  * Full smoothing supported
18
20
  * In theory, infinite number of environment features can be handled
19
21
 
@@ -29,7 +31,7 @@ It's pretty much the same as Kenji's subst, so in most cases, you just need swap
29
31
 
30
32
  h2. Repository
31
33
 
32
- You can download a pre-built RubyGem package from
34
+ You can download a pre-built RubyGems package from
33
35
 
34
36
  * rubyforge: "http://rubyforge.org/projects/egor":http://rubyforge.org/projects/egor
35
37
 
@@ -39,12 +39,19 @@
39
39
  </div>
40
40
  <%= body %>
41
41
  <p class="coda">
42
- <a href="FIXME email">Semin Lee</a>, <%= modified.pretty %><br>
42
+ Semin Lee, <%= modified.pretty %><br>
43
43
  Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
44
44
  </p>
45
45
  </div>
46
46
 
47
47
  <!-- insert site tracking codes here, like Google Urchin -->
48
-
48
+ <script type="text/javascript">
49
+ var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
50
+ document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
51
+ </script>
52
+ <script type="text/javascript">
53
+ var pageTracker = _gat._getTracker("UA-6291956-1");
54
+ pageTracker._trackPageview();
55
+ </script>
49
56
  </body>
50
57
  </html>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Semin Lee
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-11-10 00:00:00 +00:00
12
+ date: 2008-11-13 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -60,7 +60,7 @@ dependencies:
60
60
  requirements:
61
61
  - - ">="
62
62
  - !ruby/object:Gem::Version
63
- version: 1.0.7
63
+ version: 1.1.0
64
64
  version:
65
65
  - !ruby/object:Gem::Dependency
66
66
  name: hoe