egor 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/README.rdoc +3 -1
- data/lib/egor/cli.rb +178 -77
- data/lib/egor.rb +2 -2
- data/lib/environment.rb +1 -1
- data/lib/environment_feature.rb +1 -1
- data/website/index.html +12 -3
- data/website/index.txt +3 -1
- data/website/template.html.erb +9 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -9,8 +9,10 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
9
9
|
== FEATURES/PROBLEMS:
|
10
10
|
|
11
11
|
* No more segmentation fault
|
12
|
+
* Fast enough not to leave your place
|
13
|
+
* Slow enough to check your emails or have some chats with your colleagues next to you
|
12
14
|
* Full smoothing supported
|
13
|
-
*
|
15
|
+
* In theory, infinite number of environment features can be handled
|
14
16
|
|
15
17
|
== BASIC USAGE:
|
16
18
|
|
data/lib/egor/cli.rb
CHANGED
@@ -41,19 +41,19 @@ Usage:
|
|
41
41
|
egor [ options ] -f TEM-file -c CLASSDEF-file
|
42
42
|
|
43
43
|
Options:
|
44
|
-
--tem-file (-f)
|
45
|
-
--tem-list (-l)
|
46
|
-
--classdef (-c)
|
47
|
-
--outfile (-o)
|
44
|
+
--tem-file (-f) FILE: a tem file
|
45
|
+
--tem-list (-l) FILE: a list for tem files
|
46
|
+
--classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
|
47
|
+
--outfile (-o) FILE: output filename ("allmat.dat" if not specified)
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
51
51
|
0 for parial smoothing (default)
|
52
52
|
1 for full smoothing
|
53
53
|
--nosmooth: perform no smoothing operation
|
54
|
-
--cys (-y) INTEGER:
|
55
|
-
0 for using C and J only for structure
|
56
|
-
1 for both structure and sequence
|
54
|
+
--cys (-y) INTEGER:
|
55
|
+
0 for using C and J only for structure (default)
|
56
|
+
1 for both structure and sequence
|
57
57
|
--output INTEGER:
|
58
58
|
0 for raw counts (no-smoothing performed)
|
59
59
|
1 for probabilities
|
@@ -61,7 +61,7 @@ Options:
|
|
61
61
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
62
62
|
--sigma DOUBLE: change the sigma value for smoothing (default 5)
|
63
63
|
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
|
64
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
|
64
|
+
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
65
65
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
66
66
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
67
67
|
--verbose (-v) INTEGER
|
@@ -137,12 +137,6 @@ Options:
|
|
137
137
|
$outfile = "allmat.dat"
|
138
138
|
$outfh = nil # file hanfle for outfile
|
139
139
|
$output = 2
|
140
|
-
$aa_tot_obs = {}
|
141
|
-
$aa_mut_obs = {}
|
142
|
-
$aa_mutb = {}
|
143
|
-
$aa_rel_mutb = {}
|
144
|
-
$aa_rel_freq = {}
|
145
|
-
$env_aa_obs = {}
|
146
140
|
$ali_size = 0
|
147
141
|
$tot_aa = 0
|
148
142
|
$sigma = 5.0
|
@@ -154,10 +148,20 @@ Options:
|
|
154
148
|
$pidmin = nil
|
155
149
|
$pidmax = nil
|
156
150
|
$scale = 3
|
157
|
-
$add =
|
151
|
+
$add = nil
|
152
|
+
$cys = 0
|
158
153
|
$penv = false
|
159
|
-
|
154
|
+
|
155
|
+
$aa_tot_obs = {}
|
156
|
+
$aa_mut_obs = {}
|
157
|
+
$aa_mutb = {}
|
158
|
+
$aa_rel_mutb = {}
|
159
|
+
$aa_rel_freq = {}
|
160
|
+
$env_aa_obs = {}
|
160
161
|
$smooth_prob = {}
|
162
|
+
$tot_freq_mat = nil
|
163
|
+
$tot_prob_mat = nil
|
164
|
+
$tot_logo_mat = nil
|
161
165
|
|
162
166
|
# Part 2.
|
163
167
|
#
|
@@ -195,10 +199,8 @@ Options:
|
|
195
199
|
$output = arg.to_i
|
196
200
|
when '--outfile'
|
197
201
|
$outfile = arg
|
198
|
-
when '--
|
199
|
-
$
|
200
|
-
exit 1
|
201
|
-
$cysteine = (arg.to_i == 1 ? false : true)
|
202
|
+
when '--cys'
|
203
|
+
$cys = (arg.to_i == 1 ? false : true)
|
202
204
|
when '--weight'
|
203
205
|
$weight = arg.to_i
|
204
206
|
when '--sigma'
|
@@ -210,15 +212,17 @@ Options:
|
|
210
212
|
when '--noweight'
|
211
213
|
$noweight = true
|
212
214
|
when '--smooth'
|
213
|
-
$smooth = (arg.to_i == 1 ? :full : :
|
215
|
+
$smooth = (arg.to_i == 1 ? :full : :partial)
|
214
216
|
when '--nosmooth'
|
215
217
|
$nosmooth = true
|
216
218
|
when '--scale'
|
217
219
|
$scale = arg.to_f
|
218
220
|
when '--add'
|
221
|
+
$logger.error "!!! --add option is not supported yet"
|
222
|
+
exit 1
|
219
223
|
$add = arg.to_f
|
220
224
|
when '--penv'
|
221
|
-
$logger.error "!!! --penv option is not
|
225
|
+
$logger.error "!!! --penv option is not supported yet"
|
222
226
|
exit 1
|
223
227
|
$penv = true
|
224
228
|
when '--heatmap'
|
@@ -245,14 +249,19 @@ Options:
|
|
245
249
|
exit 1
|
246
250
|
end
|
247
251
|
|
252
|
+
|
248
253
|
# Part 3.
|
249
254
|
#
|
250
255
|
# Reading Environment Class Definition File
|
251
256
|
#
|
252
257
|
|
253
|
-
#
|
258
|
+
# an array for storing all environment feature objects
|
254
259
|
$env_features = []
|
255
260
|
|
261
|
+
|
262
|
+
# an array for storing indexes of constrained environment features
|
263
|
+
$cst_features = []
|
264
|
+
|
256
265
|
# aa1 amino acid in a substitution itself is a environment feature
|
257
266
|
$env_features << EnvironmentFeature.new("sequence",
|
258
267
|
$amino_acids,
|
@@ -262,24 +271,29 @@ Options:
|
|
262
271
|
|
263
272
|
# read environment class definition file and
|
264
273
|
# store them into the hash prepared above
|
274
|
+
env_index = 1
|
275
|
+
|
265
276
|
IO.foreach($classdef) do |line|
|
277
|
+
line.chomp!
|
266
278
|
if line.start_with?("#")
|
267
279
|
next
|
268
280
|
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
269
|
-
$logger.info ">>> An environment feature, #{line
|
281
|
+
$logger.info ">>> An environment feature, #{line} detected"
|
270
282
|
if env_ftr[-1] == "T"
|
271
283
|
# skip silenced environment feature
|
272
|
-
$logger.warn "!!! The environment feature, #{line
|
284
|
+
$logger.warn "!!! The environment feature, #{line} silent"
|
273
285
|
next
|
274
286
|
end
|
275
287
|
if env_ftr[-2] == "T"
|
276
|
-
$
|
288
|
+
$cst_features << env_index
|
289
|
+
$logger.warn "!!! The environment feature, #{line} constrained"
|
277
290
|
end
|
278
291
|
$env_features << EnvironmentFeature.new(env_ftr[0],
|
279
292
|
env_ftr[1].split(""),
|
280
293
|
env_ftr[2].split(""),
|
281
294
|
env_ftr[3],
|
282
295
|
env_ftr[4])
|
296
|
+
env_index += 1
|
283
297
|
else
|
284
298
|
$logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
|
285
299
|
exit 1
|
@@ -325,7 +339,7 @@ Options:
|
|
325
339
|
end
|
326
340
|
end
|
327
341
|
|
328
|
-
$ali_size +=
|
342
|
+
$ali_size += 1
|
329
343
|
env_labels = {}
|
330
344
|
disulphide = {}
|
331
345
|
|
@@ -398,19 +412,27 @@ Options:
|
|
398
412
|
aa2 = s2[pos].upcase
|
399
413
|
|
400
414
|
if !$amino_acids.include?(aa1)
|
401
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
415
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
402
416
|
next
|
403
417
|
end
|
404
418
|
|
405
419
|
if !$amino_acids.include?(aa2)
|
406
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
420
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
|
407
421
|
next
|
408
422
|
end
|
409
423
|
|
410
424
|
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
411
425
|
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
412
426
|
|
413
|
-
$
|
427
|
+
if $cst_features.empty?
|
428
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
429
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
|
430
|
+
env_labels[id2][pos].split("").values_at(*$cst_features))
|
431
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
432
|
+
else
|
433
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
434
|
+
next
|
435
|
+
end
|
414
436
|
|
415
437
|
grp_label = env_labels[id1][pos][1..-1]
|
416
438
|
|
@@ -485,6 +507,7 @@ Options:
|
|
485
507
|
cluster2.each do |id2|
|
486
508
|
seq1 = ali[id1].split("")
|
487
509
|
seq2 = ali[id2].split("")
|
510
|
+
|
488
511
|
seq1.each_with_index do |aa1, pos|
|
489
512
|
if env_labels[id1][pos].include?("X")
|
490
513
|
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
@@ -511,8 +534,17 @@ Options:
|
|
511
534
|
obs1 = 1.0 / size1
|
512
535
|
obs2 = 1.0 / size2
|
513
536
|
|
514
|
-
$
|
515
|
-
|
537
|
+
if $cst_features.empty?
|
538
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
539
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
540
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
|
541
|
+
env_labels[id2][pos].split("").values_at(*$cst_features))
|
542
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
543
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
544
|
+
else
|
545
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
546
|
+
next
|
547
|
+
end
|
516
548
|
|
517
549
|
grp_label1 = env_labels[id1][pos][1..-1]
|
518
550
|
grp_label2 = env_labels[id2][pos][1..-1]
|
@@ -605,7 +637,6 @@ HEADER
|
|
605
637
|
else
|
606
638
|
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
607
639
|
end
|
608
|
-
$outfh.puts "#"
|
609
640
|
|
610
641
|
# calculate amino acid frequencies and mutabilities, and
|
611
642
|
# print them as default statistics in the header part
|
@@ -614,7 +645,7 @@ HEADER
|
|
614
645
|
|
615
646
|
$outfh.puts "#"
|
616
647
|
$outfh.puts "# Total amino acid frequencies:\n"
|
617
|
-
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS
|
648
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
618
649
|
|
619
650
|
$aa_tot_obs.each_pair do |res, freq|
|
620
651
|
$aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
|
@@ -625,13 +656,18 @@ HEADER
|
|
625
656
|
$amino_acids.each do |res|
|
626
657
|
if $noweight
|
627
658
|
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
628
|
-
[res, $
|
659
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
629
660
|
else
|
630
661
|
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
631
|
-
[res, $
|
662
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
632
663
|
end
|
633
664
|
end
|
634
|
-
|
665
|
+
|
666
|
+
|
667
|
+
# Part 5.
|
668
|
+
#
|
669
|
+
# Calculating substitution frequency tables
|
670
|
+
#
|
635
671
|
|
636
672
|
# calculating probabilities for each environment
|
637
673
|
$envs.values.each do |e|
|
@@ -641,7 +677,7 @@ HEADER
|
|
641
677
|
end
|
642
678
|
|
643
679
|
# count raw frequencies
|
644
|
-
$
|
680
|
+
$tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
645
681
|
|
646
682
|
# for each combination of environment features
|
647
683
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -652,28 +688,33 @@ HEADER
|
|
652
688
|
$env_features[i + 1].labels.index(l)
|
653
689
|
}
|
654
690
|
}.each_with_index do |group, group_no|
|
655
|
-
|
691
|
+
grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
656
692
|
|
657
693
|
$amino_acids.each_with_index do |aa, ai|
|
658
694
|
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
659
|
-
0.upto(20) { |j|
|
695
|
+
0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
|
660
696
|
end
|
661
697
|
|
662
|
-
$
|
698
|
+
$tot_freq_mat += grp_freq_mat
|
663
699
|
|
664
700
|
if $output == 0
|
665
701
|
$outfh.puts ">#{group[0]} #{group_no}"
|
666
|
-
$outfh.puts
|
702
|
+
$outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
667
703
|
end
|
668
704
|
end
|
669
705
|
|
670
706
|
if $output == 0
|
671
707
|
$outfh.puts ">Total"
|
672
|
-
$outfh.puts $
|
708
|
+
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
673
709
|
exit 0
|
674
710
|
end
|
675
711
|
|
676
|
-
|
712
|
+
|
713
|
+
# Part 6.
|
714
|
+
#
|
715
|
+
# Calculating substitution probability tables
|
716
|
+
#
|
717
|
+
|
677
718
|
if $output == 1
|
678
719
|
$outfh.puts <<HEADER
|
679
720
|
#
|
@@ -687,7 +728,7 @@ HEADER
|
|
687
728
|
|
688
729
|
if ($output > 0) && $nosmooth
|
689
730
|
# Probability matrices
|
690
|
-
|
731
|
+
$tot_prob_mat = NMatrix.float(21, 21)
|
691
732
|
|
692
733
|
# for each combination of environment features
|
693
734
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -697,24 +738,24 @@ HEADER
|
|
697
738
|
$env_features[i + 1].labels.index(l)
|
698
739
|
}
|
699
740
|
}.each_with_index do |group, group_no|
|
700
|
-
|
741
|
+
grp_prob_mat = NMatrix.float(21,21)
|
701
742
|
|
702
743
|
$amino_acids.each_with_index do |aa, ai|
|
703
744
|
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
704
|
-
0.upto(20) { |j|
|
745
|
+
0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
705
746
|
end
|
706
747
|
|
707
|
-
|
748
|
+
$tot_prob_mat += grp_prob_mat
|
708
749
|
|
709
750
|
if ($output == 1)
|
710
751
|
$outfh.puts ">#{group[0]} #{group_no}"
|
711
|
-
$outfh.puts
|
752
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
712
753
|
end
|
713
754
|
end
|
714
755
|
|
715
756
|
if ($output == 1)
|
716
757
|
$outfh.puts ">Total"
|
717
|
-
$outfh.puts
|
758
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
718
759
|
$outfh.close
|
719
760
|
exit 0
|
720
761
|
end
|
@@ -749,6 +790,7 @@ HEADER
|
|
749
790
|
|
750
791
|
if $smooth == :partial
|
751
792
|
$outfh.puts <<HEADER
|
793
|
+
#
|
752
794
|
# Partial Smoothing:
|
753
795
|
#
|
754
796
|
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
@@ -771,7 +813,6 @@ HEADER
|
|
771
813
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
772
814
|
#
|
773
815
|
# sigma value used is: 5.00
|
774
|
-
#
|
775
816
|
HEADER
|
776
817
|
1.upto($env_features.size) do |ci|
|
777
818
|
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
@@ -872,6 +913,7 @@ HEADER
|
|
872
913
|
end
|
873
914
|
else
|
874
915
|
$outfh.puts <<HEADER
|
916
|
+
#
|
875
917
|
# Full Smoothing:
|
876
918
|
#
|
877
919
|
# p1(ri) is estimated as:
|
@@ -897,7 +939,6 @@ HEADER
|
|
897
939
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
898
940
|
#
|
899
941
|
# sigma value used is: 5.00
|
900
|
-
#
|
901
942
|
HEADER
|
902
943
|
# full smoothing
|
903
944
|
1.upto($env_features.size) do |ci|
|
@@ -959,7 +1000,7 @@ HEADER
|
|
959
1000
|
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
960
1001
|
|
961
1002
|
# for a total substitution probability matrix
|
962
|
-
|
1003
|
+
$tot_prob_mat = NMatrix.float(21,21)
|
963
1004
|
|
964
1005
|
# grouping environments by its environment labels but amino acid label
|
965
1006
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -972,46 +1013,54 @@ HEADER
|
|
972
1013
|
}
|
973
1014
|
}.each_with_index do |group, group_no|
|
974
1015
|
# calculating 21X21 substitution probability matrix for each envrionment
|
975
|
-
|
1016
|
+
grp_prob_mat = NMatrix.float(21,21)
|
976
1017
|
|
977
1018
|
$amino_acids.each_with_index do |aa, ai|
|
978
1019
|
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
979
|
-
0.upto(20) { |j|
|
1020
|
+
0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
980
1021
|
end
|
981
1022
|
|
982
|
-
|
1023
|
+
$tot_prob_mat += grp_prob_mat
|
983
1024
|
|
984
1025
|
if $output == 1
|
985
1026
|
$outfh.puts ">#{group[0]} #{group_no}"
|
986
|
-
$outfh.puts
|
1027
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
987
1028
|
end
|
988
1029
|
end
|
989
1030
|
|
990
|
-
|
1031
|
+
$tot_prob_mat /= env_groups.size
|
991
1032
|
|
992
1033
|
if $output == 1
|
993
1034
|
$outfh.puts ">Total"
|
994
|
-
$outfh.puts
|
1035
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
995
1036
|
$outfh.close
|
996
1037
|
exit 0
|
997
1038
|
end
|
998
1039
|
|
1040
|
+
|
1041
|
+
# Part 7.
|
1042
|
+
#
|
1043
|
+
# Calculating log-odds ratio scoring matrices
|
1044
|
+
#
|
999
1045
|
if $output == 2
|
1000
1046
|
$outfh.puts <<HEADER
|
1001
1047
|
#
|
1002
1048
|
# The probabilities were then divided by the background probabilities
|
1049
|
+
HEADER
|
1050
|
+
if $penv
|
1051
|
+
$outfh.puts <<HEADER
|
1003
1052
|
# which were derived from the environment-independent amino acid frequencies.
|
1004
1053
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1005
|
-
#
|
1006
|
-
# Shown here are logarithms of these values multiplied by 3/log(2)
|
1007
|
-
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1008
|
-
#
|
1009
|
-
# For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
|
1010
|
-
#
|
1011
1054
|
HEADER
|
1055
|
+
else
|
1056
|
+
$outfh.puts <<HEADER
|
1057
|
+
# which were derived from the environment-dependent amino acid frequencies.
|
1058
|
+
# ^^^^^^^^^^^^^^^^^^^^^
|
1059
|
+
HEADER
|
1060
|
+
end
|
1012
1061
|
|
1013
|
-
|
1014
|
-
|
1062
|
+
$tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
|
1063
|
+
grp_logo_mats = []
|
1015
1064
|
factor = $scale / Math::log(2)
|
1016
1065
|
|
1017
1066
|
# grouping environments by its environment labels but amino acid label
|
@@ -1027,30 +1076,82 @@ HEADER
|
|
1027
1076
|
# calculating 21X21 substitution probability matrix for each envrionment
|
1028
1077
|
grp_label = group[0]
|
1029
1078
|
grp_envs = group[1]
|
1030
|
-
grp_logo_mat = NMatrix.float(21,21)
|
1079
|
+
grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
|
1031
1080
|
|
1032
1081
|
$amino_acids.each_with_index do |aa, ai|
|
1033
1082
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1034
|
-
logo_arr = NArray.float(21)
|
1083
|
+
logo_arr = $cys ? NArray.float(22) : NArray.float(21)
|
1035
1084
|
|
1036
1085
|
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1037
|
-
paj
|
1038
|
-
odds
|
1086
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1087
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1039
1088
|
logo_arr[j] = factor * Math::log(odds)
|
1040
1089
|
end
|
1090
|
+
|
1041
1091
|
0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1092
|
+
|
1093
|
+
# adding log odds ratio for "U" (J or C) when --cys is ON
|
1094
|
+
if $cys
|
1095
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1096
|
+
prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
|
1097
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1098
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1099
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1100
|
+
end
|
1042
1101
|
end
|
1043
1102
|
|
1044
|
-
tot_logo_mat += grp_logo_mat
|
1103
|
+
$tot_logo_mat += grp_logo_mat
|
1104
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1105
|
+
end
|
1045
1106
|
|
1046
|
-
|
1047
|
-
|
1107
|
+
$tot_logo_mat /= env_groups.size
|
1108
|
+
|
1109
|
+
# calculating relative entropy for each amino acid pair H and
|
1110
|
+
# the expected score E in bit units
|
1111
|
+
#
|
1112
|
+
# I'm a bit suspicious about this part...
|
1113
|
+
tot_E = 0.0
|
1114
|
+
tot_H = 0.0
|
1115
|
+
|
1116
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1117
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1118
|
+
if i != j
|
1119
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1120
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1121
|
+
else
|
1122
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1123
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1124
|
+
end
|
1125
|
+
end
|
1048
1126
|
end
|
1049
1127
|
|
1050
|
-
|
1128
|
+
$outfh.puts <<HEADER
|
1129
|
+
#
|
1130
|
+
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1131
|
+
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1132
|
+
#
|
1133
|
+
# For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
|
1134
|
+
#
|
1135
|
+
HEADER
|
1136
|
+
|
1137
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1138
|
+
grp_label = arr[0]
|
1139
|
+
grp_logo_mat = arr[1]
|
1051
1140
|
|
1052
|
-
|
1053
|
-
|
1141
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1142
|
+
if $cys
|
1143
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1144
|
+
else
|
1145
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1146
|
+
end
|
1147
|
+
end
|
1148
|
+
|
1149
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1150
|
+
if $cys
|
1151
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1152
|
+
else
|
1153
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1154
|
+
end
|
1054
1155
|
$outfh.close
|
1055
1156
|
exit 0
|
1056
1157
|
end
|
data/lib/egor.rb
CHANGED
data/lib/environment.rb
CHANGED
data/lib/environment_feature.rb
CHANGED
data/website/index.html
CHANGED
@@ -44,6 +44,8 @@
|
|
44
44
|
<h2>Features</h2>
|
45
45
|
<ul>
|
46
46
|
<li>No more segmentation fault</li>
|
47
|
+
<li>Fast enough not to leave your place</li>
|
48
|
+
<li>Slow enough to check your emails or have some chats with your colleagues next to you</li>
|
47
49
|
<li>Full smoothing supported</li>
|
48
50
|
<li>In theory, infinite number of environment features can be handled</li>
|
49
51
|
</ul>
|
@@ -53,7 +55,7 @@
|
|
53
55
|
or
|
54
56
|
<pre>$ egor -l TEM-file -c classdef.dat</pre>
|
55
57
|
<h2>Repository</h2>
|
56
|
-
<p>You can download a pre-built
|
58
|
+
<p>You can download a pre-built RubyGems package from</p>
|
57
59
|
<ul>
|
58
60
|
<li>rubyforge: <a href="http://rubyforge.org/projects/egor">http://rubyforge.org/projects/egor</a></li>
|
59
61
|
</ul>
|
@@ -67,12 +69,19 @@ or
|
|
67
69
|
<h2>Contact</h2>
|
68
70
|
<p>Comments are welcome, please send an email to me (seminlee at gmail dot com).</p>
|
69
71
|
<p class="coda">
|
70
|
-
|
72
|
+
Semin Lee, 12th November 2008<br>
|
71
73
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
|
72
74
|
</p>
|
73
75
|
</div>
|
74
76
|
|
75
77
|
<!-- insert site tracking codes here, like Google Urchin -->
|
76
|
-
|
78
|
+
<script type="text/javascript">
|
79
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
80
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
81
|
+
</script>
|
82
|
+
<script type="text/javascript">
|
83
|
+
var pageTracker = _gat._getTracker("UA-6291956-1");
|
84
|
+
pageTracker._trackPageview();
|
85
|
+
</script>
|
77
86
|
</body>
|
78
87
|
</html>
|
data/website/index.txt
CHANGED
@@ -14,6 +14,8 @@ h2. Installation
|
|
14
14
|
h2. Features
|
15
15
|
|
16
16
|
* No more segmentation fault
|
17
|
+
* Fast enough not to leave your place
|
18
|
+
* Slow enough to check your emails or have some chats with your colleagues next to you
|
17
19
|
* Full smoothing supported
|
18
20
|
* In theory, infinite number of environment features can be handled
|
19
21
|
|
@@ -29,7 +31,7 @@ It's pretty much the same as Kenji's subst, so in most cases, you just need swap
|
|
29
31
|
|
30
32
|
h2. Repository
|
31
33
|
|
32
|
-
You can download a pre-built
|
34
|
+
You can download a pre-built RubyGems package from
|
33
35
|
|
34
36
|
* rubyforge: "http://rubyforge.org/projects/egor":http://rubyforge.org/projects/egor
|
35
37
|
|
data/website/template.html.erb
CHANGED
@@ -39,12 +39,19 @@
|
|
39
39
|
</div>
|
40
40
|
<%= body %>
|
41
41
|
<p class="coda">
|
42
|
-
|
42
|
+
Semin Lee, <%= modified.pretty %><br>
|
43
43
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
|
44
44
|
</p>
|
45
45
|
</div>
|
46
46
|
|
47
47
|
<!-- insert site tracking codes here, like Google Urchin -->
|
48
|
-
|
48
|
+
<script type="text/javascript">
|
49
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
50
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
51
|
+
</script>
|
52
|
+
<script type="text/javascript">
|
53
|
+
var pageTracker = _gat._getTracker("UA-6291956-1");
|
54
|
+
pageTracker._trackPageview();
|
55
|
+
</script>
|
49
56
|
</body>
|
50
57
|
</html>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: egor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Semin Lee
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-11-
|
12
|
+
date: 2008-11-13 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -60,7 +60,7 @@ dependencies:
|
|
60
60
|
requirements:
|
61
61
|
- - ">="
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: 1.0
|
63
|
+
version: 1.1.0
|
64
64
|
version:
|
65
65
|
- !ruby/object:Gem::Dependency
|
66
66
|
name: hoe
|