egor 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/README.rdoc +3 -1
- data/lib/egor/cli.rb +178 -77
- data/lib/egor.rb +2 -2
- data/lib/environment.rb +1 -1
- data/lib/environment_feature.rb +1 -1
- data/website/index.html +12 -3
- data/website/index.txt +3 -1
- data/website/template.html.erb +9 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -9,8 +9,10 @@ egor: Esst GeneratOR, a program for calculating environment-specific substitutio
|
|
9
9
|
== FEATURES/PROBLEMS:
|
10
10
|
|
11
11
|
* No more segmentation fault
|
12
|
+
* Fast enough not to leave your place
|
13
|
+
* Slow enough to check your emails or have some chats with your colleagues next to you
|
12
14
|
* Full smoothing supported
|
13
|
-
*
|
15
|
+
* In theory, infinite number of environment features can be handled
|
14
16
|
|
15
17
|
== BASIC USAGE:
|
16
18
|
|
data/lib/egor/cli.rb
CHANGED
@@ -41,19 +41,19 @@ Usage:
|
|
41
41
|
egor [ options ] -f TEM-file -c CLASSDEF-file
|
42
42
|
|
43
43
|
Options:
|
44
|
-
--tem-file (-f)
|
45
|
-
--tem-list (-l)
|
46
|
-
--classdef (-c)
|
47
|
-
--outfile (-o)
|
44
|
+
--tem-file (-f) FILE: a tem file
|
45
|
+
--tem-list (-l) FILE: a list for tem files
|
46
|
+
--classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat')
|
47
|
+
--outfile (-o) FILE: output filename ("allmat.dat" if not specified)
|
48
48
|
--weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting
|
49
49
|
--noweight: calculate substitution counts with no weights (default)
|
50
50
|
--smooth (-s) INTEGER:
|
51
51
|
0 for parial smoothing (default)
|
52
52
|
1 for full smoothing
|
53
53
|
--nosmooth: perform no smoothing operation
|
54
|
-
--cys (-y) INTEGER:
|
55
|
-
0 for using C and J only for structure
|
56
|
-
1 for both structure and sequence
|
54
|
+
--cys (-y) INTEGER:
|
55
|
+
0 for using C and J only for structure (default)
|
56
|
+
1 for both structure and sequence
|
57
57
|
--output INTEGER:
|
58
58
|
0 for raw counts (no-smoothing performed)
|
59
59
|
1 for probabilities
|
@@ -61,7 +61,7 @@ Options:
|
|
61
61
|
--scale INTEGER: log-odds matrices in 1/n bit units (default 3)
|
62
62
|
--sigma DOUBLE: change the sigma value for smoothing (default 5)
|
63
63
|
--add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes)
|
64
|
-
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet)
|
64
|
+
--penv: use environment-dependent frequencies for log-odds calculation (default false) (NOT implemented yet!!!)
|
65
65
|
--pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none)
|
66
66
|
--pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none)
|
67
67
|
--verbose (-v) INTEGER
|
@@ -137,12 +137,6 @@ Options:
|
|
137
137
|
$outfile = "allmat.dat"
|
138
138
|
$outfh = nil # file handle for outfile
|
139
139
|
$output = 2
|
140
|
-
$aa_tot_obs = {}
|
141
|
-
$aa_mut_obs = {}
|
142
|
-
$aa_mutb = {}
|
143
|
-
$aa_rel_mutb = {}
|
144
|
-
$aa_rel_freq = {}
|
145
|
-
$env_aa_obs = {}
|
146
140
|
$ali_size = 0
|
147
141
|
$tot_aa = 0
|
148
142
|
$sigma = 5.0
|
@@ -154,10 +148,20 @@ Options:
|
|
154
148
|
$pidmin = nil
|
155
149
|
$pidmax = nil
|
156
150
|
$scale = 3
|
157
|
-
$add =
|
151
|
+
$add = nil
|
152
|
+
$cys = 0
|
158
153
|
$penv = false
|
159
|
-
|
154
|
+
|
155
|
+
$aa_tot_obs = {}
|
156
|
+
$aa_mut_obs = {}
|
157
|
+
$aa_mutb = {}
|
158
|
+
$aa_rel_mutb = {}
|
159
|
+
$aa_rel_freq = {}
|
160
|
+
$env_aa_obs = {}
|
160
161
|
$smooth_prob = {}
|
162
|
+
$tot_freq_mat = nil
|
163
|
+
$tot_prob_mat = nil
|
164
|
+
$tot_logo_mat = nil
|
161
165
|
|
162
166
|
# Part 2.
|
163
167
|
#
|
@@ -195,10 +199,8 @@ Options:
|
|
195
199
|
$output = arg.to_i
|
196
200
|
when '--outfile'
|
197
201
|
$outfile = arg
|
198
|
-
when '--
|
199
|
-
$
|
200
|
-
exit 1
|
201
|
-
$cysteine = (arg.to_i == 1 ? false : true)
|
202
|
+
when '--cys'
|
203
|
+
$cys = (arg.to_i == 1 ? false : true)
|
202
204
|
when '--weight'
|
203
205
|
$weight = arg.to_i
|
204
206
|
when '--sigma'
|
@@ -210,15 +212,17 @@ Options:
|
|
210
212
|
when '--noweight'
|
211
213
|
$noweight = true
|
212
214
|
when '--smooth'
|
213
|
-
$smooth = (arg.to_i == 1 ? :full : :
|
215
|
+
$smooth = (arg.to_i == 1 ? :full : :partial)
|
214
216
|
when '--nosmooth'
|
215
217
|
$nosmooth = true
|
216
218
|
when '--scale'
|
217
219
|
$scale = arg.to_f
|
218
220
|
when '--add'
|
221
|
+
$logger.error "!!! --add option is not supported yet"
|
222
|
+
exit 1
|
219
223
|
$add = arg.to_f
|
220
224
|
when '--penv'
|
221
|
-
$logger.error "!!! --penv option is not
|
225
|
+
$logger.error "!!! --penv option is not supported yet"
|
222
226
|
exit 1
|
223
227
|
$penv = true
|
224
228
|
when '--heatmap'
|
@@ -245,14 +249,19 @@ Options:
|
|
245
249
|
exit 1
|
246
250
|
end
|
247
251
|
|
252
|
+
|
248
253
|
# Part 3.
|
249
254
|
#
|
250
255
|
# Reading Environment Class Definition File
|
251
256
|
#
|
252
257
|
|
253
|
-
#
|
258
|
+
# an array for storing all environment feature objects
|
254
259
|
$env_features = []
|
255
260
|
|
261
|
+
|
262
|
+
# an array for storing indexes of constrained environment features
|
263
|
+
$cst_features = []
|
264
|
+
|
256
265
|
# the aa1 amino acid in a substitution is itself an environment feature
|
257
266
|
$env_features << EnvironmentFeature.new("sequence",
|
258
267
|
$amino_acids,
|
@@ -262,24 +271,29 @@ Options:
|
|
262
271
|
|
263
272
|
# read environment class definition file and
|
264
273
|
# store them into the hash prepared above
|
274
|
+
env_index = 1
|
275
|
+
|
265
276
|
IO.foreach($classdef) do |line|
|
277
|
+
line.chomp!
|
266
278
|
if line.start_with?("#")
|
267
279
|
next
|
268
280
|
elsif (env_ftr = line.chomp.split(/;/)).length == 5
|
269
|
-
$logger.info ">>> An environment feature, #{line
|
281
|
+
$logger.info ">>> An environment feature, #{line} detected"
|
270
282
|
if env_ftr[-1] == "T"
|
271
283
|
# skip silenced environment feature
|
272
|
-
$logger.warn "!!! The environment feature, #{line
|
284
|
+
$logger.warn "!!! The environment feature, #{line} silent"
|
273
285
|
next
|
274
286
|
end
|
275
287
|
if env_ftr[-2] == "T"
|
276
|
-
$
|
288
|
+
$cst_features << env_index
|
289
|
+
$logger.warn "!!! The environment feature, #{line} constrained"
|
277
290
|
end
|
278
291
|
$env_features << EnvironmentFeature.new(env_ftr[0],
|
279
292
|
env_ftr[1].split(""),
|
280
293
|
env_ftr[2].split(""),
|
281
294
|
env_ftr[3],
|
282
295
|
env_ftr[4])
|
296
|
+
env_index += 1
|
283
297
|
else
|
284
298
|
$logger.error "@@@ #{line} doesn't seem to be a proper format for class definition"
|
285
299
|
exit 1
|
@@ -325,7 +339,7 @@ Options:
|
|
325
339
|
end
|
326
340
|
end
|
327
341
|
|
328
|
-
$ali_size +=
|
342
|
+
$ali_size += 1
|
329
343
|
env_labels = {}
|
330
344
|
disulphide = {}
|
331
345
|
|
@@ -398,19 +412,27 @@ Options:
|
|
398
412
|
aa2 = s2[pos].upcase
|
399
413
|
|
400
414
|
if !$amino_acids.include?(aa1)
|
401
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not standard amino acid" unless aa1 == "-"
|
415
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa1} is not a standard amino acid" unless aa1 == "-"
|
402
416
|
next
|
403
417
|
end
|
404
418
|
|
405
419
|
if !$amino_acids.include?(aa2)
|
406
|
-
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not standard amino acid" unless aa2 == "-"
|
420
|
+
$logger.warn "!!! #{id1}-#{pos}-#{aa2} is not a standard amino acid" unless aa2 == "-"
|
407
421
|
next
|
408
422
|
end
|
409
423
|
|
410
424
|
aa1 = (((disulphide[id1][pos] == "F") && (aa1 == "C")) ? "J" : aa1)
|
411
425
|
aa2 = (((disulphide[id2][pos] == "F") && (aa2 == "C")) ? "J" : aa2)
|
412
426
|
|
413
|
-
$
|
427
|
+
if $cst_features.empty?
|
428
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
429
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
|
430
|
+
env_labels[id2][pos].split("").values_at(*$cst_features))
|
431
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2)
|
432
|
+
else
|
433
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
434
|
+
next
|
435
|
+
end
|
414
436
|
|
415
437
|
grp_label = env_labels[id1][pos][1..-1]
|
416
438
|
|
@@ -485,6 +507,7 @@ Options:
|
|
485
507
|
cluster2.each do |id2|
|
486
508
|
seq1 = ali[id1].split("")
|
487
509
|
seq2 = ali[id2].split("")
|
510
|
+
|
488
511
|
seq1.each_with_index do |aa1, pos|
|
489
512
|
if env_labels[id1][pos].include?("X")
|
490
513
|
$logger.debug "*** Substitutions from #{id1}-#{pos}-#{aa1} were masked"
|
@@ -511,8 +534,17 @@ Options:
|
|
511
534
|
obs1 = 1.0 / size1
|
512
535
|
obs2 = 1.0 / size2
|
513
536
|
|
514
|
-
$
|
515
|
-
|
537
|
+
if $cst_features.empty?
|
538
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
539
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
540
|
+
elsif (env_labels[id1][pos].split("").values_at(*$cst_features) ==
|
541
|
+
env_labels[id2][pos].split("").values_at(*$cst_features))
|
542
|
+
$envs[env_labels[id1][pos]].increase_residue_count(aa2, 1.0 / (size1 * size2))
|
543
|
+
$envs[env_labels[id2][pos]].increase_residue_count(aa1, 1.0 / (size1 * size2))
|
544
|
+
else
|
545
|
+
$logger.debug "*** #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other"
|
546
|
+
next
|
547
|
+
end
|
516
548
|
|
517
549
|
grp_label1 = env_labels[id1][pos][1..-1]
|
518
550
|
grp_label2 = env_labels[id2][pos][1..-1]
|
@@ -605,7 +637,6 @@ HEADER
|
|
605
637
|
else
|
606
638
|
$outfh.puts "# Weighting scheme: clustering at PID #{$weight} level"
|
607
639
|
end
|
608
|
-
$outfh.puts "#"
|
609
640
|
|
610
641
|
# calculate amino acid frequencies and mutabilities, and
|
611
642
|
# print them as default statistics in the header part
|
@@ -614,7 +645,7 @@ HEADER
|
|
614
645
|
|
615
646
|
$outfh.puts "#"
|
616
647
|
$outfh.puts "# Total amino acid frequencies:\n"
|
617
|
-
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES MUT_OBS
|
648
|
+
$outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FRQ]
|
618
649
|
|
619
650
|
$aa_tot_obs.each_pair do |res, freq|
|
620
651
|
$aa_mutb[res] = $aa_mut_obs[res] / freq.to_f
|
@@ -625,13 +656,18 @@ HEADER
|
|
625
656
|
$amino_acids.each do |res|
|
626
657
|
if $noweight
|
627
658
|
$outfh.puts "# %-3s %9d %9d %5.2f %8d %8.4f" %
|
628
|
-
[res, $
|
659
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
629
660
|
else
|
630
661
|
$outfh.puts "# %-3s %9.2f %9.2f %5.2f %8d %8.4f" %
|
631
|
-
[res, $
|
662
|
+
[res, $aa_tot_obs[res], $aa_mut_obs[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
|
632
663
|
end
|
633
664
|
end
|
634
|
-
|
665
|
+
|
666
|
+
|
667
|
+
# Part 5.
|
668
|
+
#
|
669
|
+
# Calculating substitution frequency tables
|
670
|
+
#
|
635
671
|
|
636
672
|
# calculating probabilities for each environment
|
637
673
|
$envs.values.each do |e|
|
@@ -641,7 +677,7 @@ HEADER
|
|
641
677
|
end
|
642
678
|
|
643
679
|
# count raw frequencies
|
644
|
-
$
|
680
|
+
$tot_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
645
681
|
|
646
682
|
# for each combination of environment features
|
647
683
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -652,28 +688,33 @@ HEADER
|
|
652
688
|
$env_features[i + 1].labels.index(l)
|
653
689
|
}
|
654
690
|
}.each_with_index do |group, group_no|
|
655
|
-
|
691
|
+
grp_freq_mat = ($noweight ? NMatrix.int(21,21) : NMatrix.float(21,21))
|
656
692
|
|
657
693
|
$amino_acids.each_with_index do |aa, ai|
|
658
694
|
freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array
|
659
|
-
0.upto(20) { |j|
|
695
|
+
0.upto(20) { |j| grp_freq_mat[ai, j] = freq_array[j] }
|
660
696
|
end
|
661
697
|
|
662
|
-
$
|
698
|
+
$tot_freq_mat += grp_freq_mat
|
663
699
|
|
664
700
|
if $output == 0
|
665
701
|
$outfh.puts ">#{group[0]} #{group_no}"
|
666
|
-
$outfh.puts
|
702
|
+
$outfh.puts grp_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
667
703
|
end
|
668
704
|
end
|
669
705
|
|
670
706
|
if $output == 0
|
671
707
|
$outfh.puts ">Total"
|
672
|
-
$outfh.puts $
|
708
|
+
$outfh.puts $tot_freq_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
673
709
|
exit 0
|
674
710
|
end
|
675
711
|
|
676
|
-
|
712
|
+
|
713
|
+
# Part 6.
|
714
|
+
#
|
715
|
+
# Calculating substitution probability tables
|
716
|
+
#
|
717
|
+
|
677
718
|
if $output == 1
|
678
719
|
$outfh.puts <<HEADER
|
679
720
|
#
|
@@ -687,7 +728,7 @@ HEADER
|
|
687
728
|
|
688
729
|
if ($output > 0) && $nosmooth
|
689
730
|
# Probability matrices
|
690
|
-
|
731
|
+
$tot_prob_mat = NMatrix.float(21, 21)
|
691
732
|
|
692
733
|
# for each combination of environment features
|
693
734
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -697,24 +738,24 @@ HEADER
|
|
697
738
|
$env_features[i + 1].labels.index(l)
|
698
739
|
}
|
699
740
|
}.each_with_index do |group, group_no|
|
700
|
-
|
741
|
+
grp_prob_mat = NMatrix.float(21,21)
|
701
742
|
|
702
743
|
$amino_acids.each_with_index do |aa, ai|
|
703
744
|
prob_array = group[1].find { |e| e.label.start_with?(aa) }.prob_array
|
704
|
-
0.upto(20) { |j|
|
745
|
+
0.upto(20) { |j| grp_prob_mat[ai, j] = prob_array[j] }
|
705
746
|
end
|
706
747
|
|
707
|
-
|
748
|
+
$tot_prob_mat += grp_prob_mat
|
708
749
|
|
709
750
|
if ($output == 1)
|
710
751
|
$outfh.puts ">#{group[0]} #{group_no}"
|
711
|
-
$outfh.puts
|
752
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
712
753
|
end
|
713
754
|
end
|
714
755
|
|
715
756
|
if ($output == 1)
|
716
757
|
$outfh.puts ">Total"
|
717
|
-
$outfh.puts
|
758
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
718
759
|
$outfh.close
|
719
760
|
exit 0
|
720
761
|
end
|
@@ -749,6 +790,7 @@ HEADER
|
|
749
790
|
|
750
791
|
if $smooth == :partial
|
751
792
|
$outfh.puts <<HEADER
|
793
|
+
#
|
752
794
|
# Partial Smoothing:
|
753
795
|
#
|
754
796
|
# p1(ri) (i.e., amino acid composition) is estimated by summing over
|
@@ -771,7 +813,6 @@ HEADER
|
|
771
813
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
772
814
|
#
|
773
815
|
# sigma value used is: 5.00
|
774
|
-
#
|
775
816
|
HEADER
|
776
817
|
1.upto($env_features.size) do |ci|
|
777
818
|
# for partial smoothing, only P1 ~ P3, and Pn are considered
|
@@ -872,6 +913,7 @@ HEADER
|
|
872
913
|
end
|
873
914
|
else
|
874
915
|
$outfh.puts <<HEADER
|
916
|
+
#
|
875
917
|
# Full Smoothing:
|
876
918
|
#
|
877
919
|
# p1(ri) is estimated as:
|
@@ -897,7 +939,6 @@ HEADER
|
|
897
939
|
# Weights (omegas) are calculated as in Topham et al. 1993)
|
898
940
|
#
|
899
941
|
# sigma value used is: 5.00
|
900
|
-
#
|
901
942
|
HEADER
|
902
943
|
# full smoothing
|
903
944
|
1.upto($env_features.size) do |ci|
|
@@ -959,7 +1000,7 @@ HEADER
|
|
959
1000
|
$envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
|
960
1001
|
|
961
1002
|
# for a total substitution probability matrix
|
962
|
-
|
1003
|
+
$tot_prob_mat = NMatrix.float(21,21)
|
963
1004
|
|
964
1005
|
# grouping environments by their environment labels, excluding the amino acid label
|
965
1006
|
env_groups = $envs.values.group_by { |env| env.label[1..-1] }
|
@@ -972,46 +1013,54 @@ HEADER
|
|
972
1013
|
}
|
973
1014
|
}.each_with_index do |group, group_no|
|
974
1015
|
# calculating 21x21 substitution probability matrix for each environment
|
975
|
-
|
1016
|
+
grp_prob_mat = NMatrix.float(21,21)
|
976
1017
|
|
977
1018
|
$amino_acids.each_with_index do |aa, ai|
|
978
1019
|
smooth_prob_array = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array
|
979
|
-
0.upto(20) { |j|
|
1020
|
+
0.upto(20) { |j| grp_prob_mat[ai, j] = smooth_prob_array[j] }
|
980
1021
|
end
|
981
1022
|
|
982
|
-
|
1023
|
+
$tot_prob_mat += grp_prob_mat
|
983
1024
|
|
984
1025
|
if $output == 1
|
985
1026
|
$outfh.puts ">#{group[0]} #{group_no}"
|
986
|
-
$outfh.puts
|
1027
|
+
$outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
987
1028
|
end
|
988
1029
|
end
|
989
1030
|
|
990
|
-
|
1031
|
+
$tot_prob_mat /= env_groups.size
|
991
1032
|
|
992
1033
|
if $output == 1
|
993
1034
|
$outfh.puts ">Total"
|
994
|
-
$outfh.puts
|
1035
|
+
$outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
995
1036
|
$outfh.close
|
996
1037
|
exit 0
|
997
1038
|
end
|
998
1039
|
|
1040
|
+
|
1041
|
+
# Part 7.
|
1042
|
+
#
|
1043
|
+
# Calculating log-odds ratio scoring matrices
|
1044
|
+
#
|
999
1045
|
if $output == 2
|
1000
1046
|
$outfh.puts <<HEADER
|
1001
1047
|
#
|
1002
1048
|
# The probabilities were then divided by the background probabilities
|
1049
|
+
HEADER
|
1050
|
+
if $penv
|
1051
|
+
$outfh.puts <<HEADER
|
1003
1052
|
# which were derived from the environment-independent amino acid frequencies.
|
1004
1053
|
# ^^^^^^^^^^^^^^^^^^^^^^^
|
1005
|
-
#
|
1006
|
-
# Shown here are logarithms of these values multiplied by 3/log(2)
|
1007
|
-
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1008
|
-
#
|
1009
|
-
# For total (composite) matrix, Entropy = XXX bits, Expected score = XXX
|
1010
|
-
#
|
1011
1054
|
HEADER
|
1055
|
+
else
|
1056
|
+
$outfh.puts <<HEADER
|
1057
|
+
# which were derived from the environment-dependent amino acid frequencies.
|
1058
|
+
# ^^^^^^^^^^^^^^^^^^^^^
|
1059
|
+
HEADER
|
1060
|
+
end
|
1012
1061
|
|
1013
|
-
|
1014
|
-
|
1062
|
+
$tot_logo_mat = $cys ? NMatrix.float(21,22) : NMatrix.float(21,21)
|
1063
|
+
grp_logo_mats = []
|
1015
1064
|
factor = $scale / Math::log(2)
|
1016
1065
|
|
1017
1066
|
# grouping environments by their environment labels, excluding the amino acid label
|
@@ -1027,30 +1076,82 @@ HEADER
|
|
1027
1076
|
# calculating 21x21 substitution probability matrix for each environment
|
1028
1077
|
grp_label = group[0]
|
1029
1078
|
grp_envs = group[1]
|
1030
|
-
grp_logo_mat = NMatrix.float(21,21)
|
1079
|
+
grp_logo_mat = $cys ? NMatrix.float(21, 22) : NMatrix.float(21,21)
|
1031
1080
|
|
1032
1081
|
$amino_acids.each_with_index do |aa, ai|
|
1033
1082
|
env = grp_envs.detect { |e| e.label.start_with?(aa) }
|
1034
|
-
logo_arr = NArray.float(21)
|
1083
|
+
logo_arr = $cys ? NArray.float(22) : NArray.float(21)
|
1035
1084
|
|
1036
1085
|
env.smooth_prob_array.to_a.each_with_index do |prob, j|
|
1037
|
-
paj
|
1038
|
-
odds
|
1086
|
+
paj = 100.0 * $aa_rel_freq[$amino_acids[j]]
|
1087
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1039
1088
|
logo_arr[j] = factor * Math::log(odds)
|
1040
1089
|
end
|
1090
|
+
|
1041
1091
|
0.upto(20) { |j| grp_logo_mat[ai, j] = logo_arr[j] }
|
1092
|
+
|
1093
|
+
# adding log-odds ratio for "U" (J or C) when --cys is ON
|
1094
|
+
if $cys
|
1095
|
+
paj = 100.0 * ($aa_rel_freq["C"] + $aa_rel_freq["J"])
|
1096
|
+
prob = env.smooth_prob_array[$amino_acids.index("C")] + env.smooth_prob_array[$amino_acids.index("J")]
|
1097
|
+
odds = prob == 0.0 ? 0.000001 / paj : prob / paj
|
1098
|
+
logo_arr[logo_arr.size - 1] = factor * Math::log(odds)
|
1099
|
+
grp_logo_mat[ai, logo_arr.size - 1] = logo_arr[logo_arr.size - 1]
|
1100
|
+
end
|
1042
1101
|
end
|
1043
1102
|
|
1044
|
-
tot_logo_mat += grp_logo_mat
|
1103
|
+
$tot_logo_mat += grp_logo_mat
|
1104
|
+
grp_logo_mats << [grp_label, grp_logo_mat]
|
1105
|
+
end
|
1045
1106
|
|
1046
|
-
|
1047
|
-
|
1107
|
+
$tot_logo_mat /= env_groups.size
|
1108
|
+
|
1109
|
+
# calculating relative entropy for each amino acid pair H and
|
1110
|
+
# the expected score E in bit units
|
1111
|
+
#
|
1112
|
+
# I'm a bit suspicious about this part...
|
1113
|
+
tot_E = 0.0
|
1114
|
+
tot_H = 0.0
|
1115
|
+
|
1116
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |i|
|
1117
|
+
0.upto($tot_logo_mat.shape[0] - 1) do |j|
|
1118
|
+
if i != j
|
1119
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]] / 2.0
|
1120
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 2.0 / 10000.0
|
1121
|
+
else
|
1122
|
+
tot_E += $tot_logo_mat[i, j] * $aa_rel_freq[$amino_acids[i]] * $aa_rel_freq[$amino_acids[j]]
|
1123
|
+
tot_H += $tot_logo_mat[i, j] * $tot_prob_mat[i, j] / 10000.0
|
1124
|
+
end
|
1125
|
+
end
|
1048
1126
|
end
|
1049
1127
|
|
1050
|
-
|
1128
|
+
$outfh.puts <<HEADER
|
1129
|
+
#
|
1130
|
+
# Shown here are logarithms of these values multiplied by #{$scale}/log(2)
|
1131
|
+
# rounded to the nearest integer (log-odds scores in 1/3 bit units).
|
1132
|
+
#
|
1133
|
+
# For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E}
|
1134
|
+
#
|
1135
|
+
HEADER
|
1136
|
+
|
1137
|
+
grp_logo_mats.each_with_index do |arr, grp_no|
|
1138
|
+
grp_label = arr[0]
|
1139
|
+
grp_logo_mat = arr[1]
|
1051
1140
|
|
1052
|
-
|
1053
|
-
|
1141
|
+
$outfh.puts ">#{grp_label} #{grp_no}"
|
1142
|
+
if $cys
|
1143
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1144
|
+
else
|
1145
|
+
$outfh.puts grp_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1146
|
+
end
|
1147
|
+
end
|
1148
|
+
|
1149
|
+
$outfh.puts ">Total #{grp_logo_mats.size}"
|
1150
|
+
if $cys
|
1151
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids + %w[U])
|
1152
|
+
else
|
1153
|
+
$outfh.puts $tot_logo_mat.round.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
|
1154
|
+
end
|
1054
1155
|
$outfh.close
|
1055
1156
|
exit 0
|
1056
1157
|
end
|
data/lib/egor.rb
CHANGED
data/lib/environment.rb
CHANGED
data/lib/environment_feature.rb
CHANGED
data/website/index.html
CHANGED
@@ -44,6 +44,8 @@
|
|
44
44
|
<h2>Features</h2>
|
45
45
|
<ul>
|
46
46
|
<li>No more segmentation fault</li>
|
47
|
+
<li>Fast enough not to leave your place</li>
|
48
|
+
<li>Slow enough to check your emails or have some chats with your colleagues next to you</li>
|
47
49
|
<li>Full smoothing supported</li>
|
48
50
|
<li>In theory, infinite number of environment features can be handled</li>
|
49
51
|
</ul>
|
@@ -53,7 +55,7 @@
|
|
53
55
|
or
|
54
56
|
<pre>$ egor -l TEM-file -c classdef.dat</pre>
|
55
57
|
<h2>Repository</h2>
|
56
|
-
<p>You can download a pre-built
|
58
|
+
<p>You can download a pre-built RubyGems package from</p>
|
57
59
|
<ul>
|
58
60
|
<li>rubyforge: <a href="http://rubyforge.org/projects/egor">http://rubyforge.org/projects/egor</a></li>
|
59
61
|
</ul>
|
@@ -67,12 +69,19 @@ or
|
|
67
69
|
<h2>Contact</h2>
|
68
70
|
<p>Comments are welcome, please send an email to me (seminlee at gmail dot com).</p>
|
69
71
|
<p class="coda">
|
70
|
-
|
72
|
+
Semin Lee, 12th November 2008<br>
|
71
73
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
|
72
74
|
</p>
|
73
75
|
</div>
|
74
76
|
|
75
77
|
<!-- insert site tracking codes here, like Google Urchin -->
|
76
|
-
|
78
|
+
<script type="text/javascript">
|
79
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
80
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
81
|
+
</script>
|
82
|
+
<script type="text/javascript">
|
83
|
+
var pageTracker = _gat._getTracker("UA-6291956-1");
|
84
|
+
pageTracker._trackPageview();
|
85
|
+
</script>
|
77
86
|
</body>
|
78
87
|
</html>
|
data/website/index.txt
CHANGED
@@ -14,6 +14,8 @@ h2. Installation
|
|
14
14
|
h2. Features
|
15
15
|
|
16
16
|
* No more segmentation fault
|
17
|
+
* Fast enough not to leave your place
|
18
|
+
* Slow enough to check your emails or have some chats with your colleagues next to you
|
17
19
|
* Full smoothing supported
|
18
20
|
* In theory, infinite number of environment features can be handled
|
19
21
|
|
@@ -29,7 +31,7 @@ It's pretty much the same as Kenji's subst, so in most cases, you just need swap
|
|
29
31
|
|
30
32
|
h2. Repository
|
31
33
|
|
32
|
-
You can download a pre-built
|
34
|
+
You can download a pre-built RubyGems package from
|
33
35
|
|
34
36
|
* rubyforge: "http://rubyforge.org/projects/egor":http://rubyforge.org/projects/egor
|
35
37
|
|
data/website/template.html.erb
CHANGED
@@ -39,12 +39,19 @@
|
|
39
39
|
</div>
|
40
40
|
<%= body %>
|
41
41
|
<p class="coda">
|
42
|
-
|
42
|
+
Semin Lee, <%= modified.pretty %><br>
|
43
43
|
Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
|
44
44
|
</p>
|
45
45
|
</div>
|
46
46
|
|
47
47
|
<!-- insert site tracking codes here, like Google Urchin -->
|
48
|
-
|
48
|
+
<script type="text/javascript">
|
49
|
+
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
50
|
+
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
51
|
+
</script>
|
52
|
+
<script type="text/javascript">
|
53
|
+
var pageTracker = _gat._getTracker("UA-6291956-1");
|
54
|
+
pageTracker._trackPageview();
|
55
|
+
</script>
|
49
56
|
</body>
|
50
57
|
</html>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: egor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Semin Lee
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-11-
|
12
|
+
date: 2008-11-13 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -60,7 +60,7 @@ dependencies:
|
|
60
60
|
requirements:
|
61
61
|
- - ">="
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: 1.0
|
63
|
+
version: 1.1.0
|
64
64
|
version:
|
65
65
|
- !ruby/object:Gem::Dependency
|
66
66
|
name: hoe
|