bio 0.7.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. data/bin/bioruby +71 -27
  2. data/bin/br_biofetch.rb +5 -17
  3. data/bin/br_bioflat.rb +14 -26
  4. data/bin/br_biogetseq.rb +6 -18
  5. data/bin/br_pmfetch.rb +6 -16
  6. data/doc/Changes-0.7.rd +35 -0
  7. data/doc/KEGG_API.rd +287 -172
  8. data/doc/KEGG_API.rd.ja +273 -160
  9. data/doc/Tutorial.rd +18 -9
  10. data/doc/Tutorial.rd.ja +656 -138
  11. data/lib/bio.rb +6 -24
  12. data/lib/bio/alignment.rb +5 -5
  13. data/lib/bio/appl/blast.rb +132 -98
  14. data/lib/bio/appl/blast/format0.rb +9 -19
  15. data/lib/bio/appl/blast/wublast.rb +5 -18
  16. data/lib/bio/appl/emboss.rb +40 -47
  17. data/lib/bio/appl/hmmer.rb +116 -82
  18. data/lib/bio/appl/hmmer/report.rb +509 -364
  19. data/lib/bio/appl/spidey/report.rb +7 -18
  20. data/lib/bio/data/na.rb +3 -21
  21. data/lib/bio/db.rb +3 -21
  22. data/lib/bio/db/aaindex.rb +147 -52
  23. data/lib/bio/db/embl/common.rb +27 -6
  24. data/lib/bio/db/embl/embl.rb +18 -10
  25. data/lib/bio/db/embl/sptr.rb +87 -67
  26. data/lib/bio/db/embl/swissprot.rb +32 -3
  27. data/lib/bio/db/embl/trembl.rb +32 -3
  28. data/lib/bio/db/embl/uniprot.rb +32 -3
  29. data/lib/bio/db/fasta.rb +327 -289
  30. data/lib/bio/db/medline.rb +25 -4
  31. data/lib/bio/db/nbrf.rb +12 -20
  32. data/lib/bio/db/pdb.rb +4 -1
  33. data/lib/bio/db/pdb/chemicalcomponent.rb +240 -0
  34. data/lib/bio/db/pdb/pdb.rb +13 -8
  35. data/lib/bio/db/rebase.rb +93 -97
  36. data/lib/bio/feature.rb +2 -31
  37. data/lib/bio/io/ddbjxml.rb +167 -139
  38. data/lib/bio/io/fastacmd.rb +89 -56
  39. data/lib/bio/io/flatfile.rb +994 -278
  40. data/lib/bio/io/flatfile/index.rb +257 -194
  41. data/lib/bio/io/flatfile/indexer.rb +37 -29
  42. data/lib/bio/reference.rb +147 -64
  43. data/lib/bio/sequence.rb +57 -417
  44. data/lib/bio/sequence/aa.rb +64 -0
  45. data/lib/bio/sequence/common.rb +175 -0
  46. data/lib/bio/sequence/compat.rb +68 -0
  47. data/lib/bio/sequence/format.rb +134 -0
  48. data/lib/bio/sequence/generic.rb +24 -0
  49. data/lib/bio/sequence/na.rb +189 -0
  50. data/lib/bio/shell.rb +9 -23
  51. data/lib/bio/shell/core.rb +130 -125
  52. data/lib/bio/shell/demo.rb +143 -0
  53. data/lib/bio/shell/{session.rb → interface.rb} +42 -40
  54. data/lib/bio/shell/object.rb +52 -0
  55. data/lib/bio/shell/plugin/codon.rb +4 -22
  56. data/lib/bio/shell/plugin/emboss.rb +23 -0
  57. data/lib/bio/shell/plugin/entry.rb +34 -25
  58. data/lib/bio/shell/plugin/flatfile.rb +5 -23
  59. data/lib/bio/shell/plugin/keggapi.rb +11 -24
  60. data/lib/bio/shell/plugin/midi.rb +5 -23
  61. data/lib/bio/shell/plugin/obda.rb +4 -22
  62. data/lib/bio/shell/plugin/seq.rb +6 -24
  63. data/lib/bio/shell/rails/Rakefile +10 -0
  64. data/lib/bio/shell/rails/app/controllers/application.rb +4 -0
  65. data/lib/bio/shell/rails/app/controllers/shell_controller.rb +94 -0
  66. data/lib/bio/shell/rails/app/helpers/application_helper.rb +3 -0
  67. data/lib/bio/shell/rails/app/models/shell_connection.rb +30 -0
  68. data/lib/bio/shell/rails/app/views/layouts/shell.rhtml +37 -0
  69. data/lib/bio/shell/rails/app/views/shell/history.rhtml +5 -0
  70. data/lib/bio/shell/rails/app/views/shell/index.rhtml +2 -0
  71. data/lib/bio/shell/rails/app/views/shell/show.rhtml +13 -0
  72. data/lib/bio/shell/rails/config/boot.rb +19 -0
  73. data/lib/bio/shell/rails/config/database.yml +85 -0
  74. data/lib/bio/shell/rails/config/environment.rb +53 -0
  75. data/lib/bio/shell/rails/config/environments/development.rb +19 -0
  76. data/lib/bio/shell/rails/config/environments/production.rb +19 -0
  77. data/lib/bio/shell/rails/config/environments/test.rb +19 -0
  78. data/lib/bio/shell/rails/config/routes.rb +19 -0
  79. data/lib/bio/shell/rails/doc/README_FOR_APP +2 -0
  80. data/lib/bio/shell/rails/public/404.html +8 -0
  81. data/lib/bio/shell/rails/public/500.html +8 -0
  82. data/lib/bio/shell/rails/public/dispatch.cgi +10 -0
  83. data/lib/bio/shell/rails/public/dispatch.fcgi +24 -0
  84. data/lib/bio/shell/rails/public/dispatch.rb +10 -0
  85. data/lib/bio/shell/rails/public/favicon.ico +0 -0
  86. data/lib/bio/shell/rails/public/images/icon.png +0 -0
  87. data/lib/bio/shell/rails/public/images/rails.png +0 -0
  88. data/lib/bio/shell/rails/public/index.html +277 -0
  89. data/lib/bio/shell/rails/public/javascripts/controls.js +750 -0
  90. data/lib/bio/shell/rails/public/javascripts/dragdrop.js +584 -0
  91. data/lib/bio/shell/rails/public/javascripts/effects.js +854 -0
  92. data/lib/bio/shell/rails/public/javascripts/prototype.js +1785 -0
  93. data/lib/bio/shell/rails/public/robots.txt +1 -0
  94. data/lib/bio/shell/rails/public/stylesheets/main.css +187 -0
  95. data/lib/bio/shell/rails/script/about +3 -0
  96. data/lib/bio/shell/rails/script/breakpointer +3 -0
  97. data/lib/bio/shell/rails/script/console +3 -0
  98. data/lib/bio/shell/rails/script/destroy +3 -0
  99. data/lib/bio/shell/rails/script/generate +3 -0
  100. data/lib/bio/shell/rails/script/performance/benchmarker +3 -0
  101. data/lib/bio/shell/rails/script/performance/profiler +3 -0
  102. data/lib/bio/shell/rails/script/plugin +3 -0
  103. data/lib/bio/shell/rails/script/process/reaper +3 -0
  104. data/lib/bio/shell/rails/script/process/spawner +3 -0
  105. data/lib/bio/shell/rails/script/process/spinner +3 -0
  106. data/lib/bio/shell/rails/script/runner +3 -0
  107. data/lib/bio/shell/rails/script/server +42 -0
  108. data/lib/bio/shell/rails/test/test_helper.rb +28 -0
  109. data/lib/bio/shell/web.rb +90 -0
  110. data/lib/bio/util/contingency_table.rb +231 -225
  111. data/sample/any2fasta.rb +59 -0
  112. data/test/data/HMMER/hmmpfam.out +64 -0
  113. data/test/data/HMMER/hmmsearch.out +88 -0
  114. data/test/data/aaindex/DAYM780301 +30 -0
  115. data/test/data/aaindex/PRAM900102 +20 -0
  116. data/test/data/bl2seq/cd8a_cd8b_blastp.bl2seq +53 -0
  117. data/test/data/bl2seq/cd8a_p53_e-5blastp.bl2seq +37 -0
  118. data/test/data/blast/{eco:b0002.faa → b0002.faa} +0 -0
  119. data/test/data/blast/{eco:b0002.faa.m0 → b0002.faa.m0} +2 -2
  120. data/test/data/blast/{eco:b0002.faa.m7 → b0002.faa.m7} +1 -1
  121. data/test/data/blast/{eco:b0002.faa.m8 → b0002.faa.m8} +0 -0
  122. data/test/unit/bio/appl/bl2seq/test_report.rb +134 -0
  123. data/test/unit/bio/appl/blast/test_report.rb +15 -12
  124. data/test/unit/bio/appl/blast/test_xmlparser.rb +4 -4
  125. data/test/unit/bio/appl/hmmer/test_report.rb +355 -0
  126. data/test/unit/bio/appl/test_blast.rb +5 -5
  127. data/test/unit/bio/data/test_na.rb +9 -18
  128. data/test/unit/bio/db/pdb/test_pdb.rb +169 -0
  129. data/test/unit/bio/db/test_aaindex.rb +197 -0
  130. data/test/unit/bio/io/test_fastacmd.rb +55 -0
  131. data/test/unit/bio/sequence/test_aa.rb +102 -0
  132. data/test/unit/bio/sequence/test_common.rb +178 -0
  133. data/test/unit/bio/sequence/test_compat.rb +82 -0
  134. data/test/unit/bio/sequence/test_na.rb +242 -0
  135. data/test/unit/bio/shell/plugin/test_seq.rb +29 -19
  136. data/test/unit/bio/test_alignment.rb +15 -7
  137. data/test/unit/bio/test_reference.rb +198 -0
  138. data/test/unit/bio/test_sequence.rb +4 -49
  139. data/test/unit/bio/test_shell.rb +2 -2
  140. metadata +118 -15
  141. data/lib/bio/io/brdb.rb +0 -103
  142. data/lib/bioruby.rb +0 -34
@@ -0,0 +1,64 @@
1
+ #
2
+ # = bio/sequence/aa.rb - amino acid sequence class
3
+ #
4
+ # Copyright:: Copyright (C) 2006
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: Ruby's
7
+ #
8
+ # $Id: aa.rb,v 1.2 2006/02/06 14:11:31 k Exp $
9
+ #
10
+
11
+ require 'bio/sequence/common'
12
+
13
+ module Bio
14
+
15
+ autoload :AminoAcid, 'bio/data/aa'
16
+
17
+ class Sequence
18
+
19
+
20
+ # Amino Acid sequence
21
+ class AA < String
22
+
23
+ include Bio::Sequence::Common
24
+
25
# Builds an amino acid sequence object from the string +str+.
# After the String itself is initialised, the text is upper-cased in place
# and every space, tab, newline and carriage return is removed.
def initialize(str)
  super
  upcase!
  delete!(" \t\n\r")
end
31
+
32
+
33
# Returns the estimated weight of this protein, i.e. the result of
# Bio::AminoAcid.weight called with the sequence itself.
def molecular_weight
  aa_table = Bio::AminoAcid
  aa_table.weight(self)
end
37
+
38
# Returns the result of Bio::AminoAcid.to_re called with the sequence itself
# (presumably a Regexp matching the sequence -- confirm in bio/data/aa).
def to_re
  aa_table = Bio::AminoAcid
  aa_table.to_re(self)
end
41
+
42
# Returns an Array with one entry per residue, in sequence order: the value
# that Bio::AminoAcid.names holds for the residue's one-letter code
# (the three-letter code).
def codes
  unpack('C*').map do |byte|
    Bio::AminoAcid.names[byte.chr]
  end
end
51
+
52
# Like #codes, but every three-letter code is looked up once more in
# Bio::AminoAcid.names, giving the long residue names.
def names
  self.codes.map { |three_letter| Bio::AminoAcid.names[three_letter] }
end
58
+
59
+ end # AA
60
+
61
+ end # Sequence
62
+
63
+ end # Bio
64
+
@@ -0,0 +1,175 @@
1
+ #
2
+ # = bio/sequence/common.rb - common methods for biological sequence
3
+ #
4
+ # Copyright:: Copyright (C) 2006
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: Ruby's
7
+ #
8
+ # $Id: common.rb,v 1.2 2006/02/06 14:16:17 k Exp $
9
+ #
10
+
11
+ module Bio
12
+
13
+ autoload :Locations, 'bio/location'
14
+
15
+ class Sequence
16
+
17
+ # This module provides common methods for biological sequence classes
18
+ # which must inherit String.
19
+ module Common
20
+
21
# Returns the sequence text as a new, plain String object (not an instance
# of the sequence class).  Also available as #to_str.
def to_s
  plain_copy = String.new(self)
  plain_copy
end
alias to_str to_s
25
+
26
# Returns a new object of the receiver's own class built from the receiver,
# so the class's initialize runs again (clean up such as white space removal
# and case unification happens there).
def seq
  klass = self.class
  klass.new(self)
end
31
+
32
# Destructive counterpart of #seq: re-runs initialize on the receiver with
# its own content and returns the receiver.  Also available as #seq!.
def normalize!
  initialize(self)
  return self
end
alias seq! normalize!
38
+
39
# Appends to the receiver.  The arguments are first turned into an object of
# the receiver's own class (so that class's initialize is applied to them)
# and the result is handed to the inherited implementation.
# Also available as #concat.
def <<(*arg)
  addition = self.class.new(*arg)
  super(addition)
end
alias concat <<
43
+
44
# Concatenates using the inherited implementation and wraps the result in a
# new object of the receiver's own class.
def +(*arg)
  joined = super(*arg)
  self.class.new(joined)
end
47
+
48
# Returns the part of the sequence from position +s+ to position +e+,
# both 1-based and inclusive.  Defaults cover the whole sequence.
#
# Raises RuntimeError unless both positions are greater than zero.
def subseq(s = 1, e = self.length)
  unless (s > 0) && (e > 0)
    raise "Error: start/end position must be a positive integer"
  end
  self[(s - 1)..(e - 1)]
end
55
+
56
# Iterates over substrings of length +window_size+, moving the window
# forward by +step_size+ positions each time, and yields each substring.
# By specifying +step_size+, codon sized shifting or splitting a genome
# sequence with overlapping ends can easily be done.
#
# Returns the remainder of the sequence, i.e. everything after the last
# yielded window (nil when the sequence is shorter than +window_size+).
#
# Example:
#   # prints average GC% on each 100bp
#   seq.window_search(100) do |subseq|
#     puts subseq.gc
#   end
#   # prints every translated peptide (length 5aa) in the same frame
#   seq.window_search(15, 3) do |subseq|
#     puts subseq.translate
#   end
#   # split genome sequence by 10000bp with 1000bp overlap in fasta format
#   i = 1
#   remainder = seq.window_search(10000, 9000) do |subseq|
#     puts subseq.to_fasta("segment #{i}", 60)
#     i += 1
#   end
#   puts remainder.to_fasta("segment #{i}", 60)
#
def window_search(window_size, step_size = 1)
  # The offset of the last window is recorded explicitly: a block parameter
  # is local to the block, so it cannot be read after the loop.
  last_step = 0
  0.step(self.length - window_size, step_size) do |offset|
    yield self[offset, window_size]
    last_step = offset
  end
  self[last_step + window_size .. -1]
end
86
+
87
# Receives a hash that maps residues/bases to numeric values and returns the
# sum of those values along the sequence, as a Float.  Residues missing
# from the hash contribute the hash's own default, or 0.0 when it has none.
# Especially useful together with window_search and amino acid indices.
#
# The hash passed in is not modified.
def total(hash)
  sum = 0.0
  self.each_byte do |byte|
    sum += hash[byte.chr] || 0.0
  end
  sum
end
100
+
101
# Counts how often each residue or base occurs.  Returns a Hash (default
# value 0) keyed by the single characters matched by /./.
def composition
  scan(/./).inject(Hash.new(0)) do |tally, residue|
    tally[residue] += 1
    tally
  end
end
109
+
110
# Returns a randomized sequence.  Without an argument the composition of
# the receiver is kept.  When +hash+ (residue/base => count) is given, the
# result is drawn from those counts instead; this is how a random sequence
# is generated from an empty one (see the class methods NA.randomize and
# AA.randomize).
#
# If a block is given, each randomly chosen residue/base is yielded instead
# of being collected, and the returned sequence is empty.
#
# The hash passed in is not modified.
def randomize(hash = nil)
  count = hash ? hash.clone : self.composition
  # Draw exactly as many residues as the counts provide; using the
  # receiver's length here would ask for more residues than are available.
  length = count.values.inject(0) { |sum, n| sum + n }

  picked = []
  weight = {}
  length.times do
    # Each residue is weighted by its remaining count times a random number;
    # the heaviest one is taken, so exhausted residues (weight 0) lose.
    count.each do |residue, remaining|
      weight[residue] = remaining * rand
    end
    choice = weight.max { |a, b| a[1] <=> b[1] }.first
    count[choice] -= 1

    if block_given?
      yield choice
    else
      picked << choice
    end
  end
  self.class.new(picked.join)
end
140
+
141
# Generates a new random sequence with the given frequency of bases or
# residues.  The sequence length is determined by the sum of the
# base/residue occurrences.  Implemented by building an empty object with
# self.new('') and delegating to its #randomize with the same arguments
# and block.
#
# NOTE(review): this is written as `def self.` inside the Common module, so
# `self` is the module here -- confirm it is reachable from the sequence
# classes (compat.rb defines the same method directly on NA and AA).
def self.randomize(*arg, &block)
  blank = self.new('')
  blank.randomize(*arg, &block)
end
147
+
148
# Splices the sequence according to +position+, which is either a Locations
# object or something Locations.new accepts (a GenBank style position
# string).  See also: bio/location.rb
#
# For each location, its own replacement sequence is used when it has one;
# otherwise the matching subsequence is taken and, for a negative strand,
# complemented in place.  The pieces are joined in order and returned as a
# new object of the receiver's class.  Also available as #splicing.
def splice(position)
  position = Locations.new(position) unless position.is_a?(Locations)

  spliced = ''
  position.each do |location|
    fragment = location.sequence
    unless fragment
      fragment = self.subseq(location.from, location.to)
      begin
        fragment.complement! if location.strand < 0
      rescue NameError
        # best effort: a NameError raised while complementing is ignored
        # and the fragment is used as it is
      end
    end
    spliced << fragment
  end
  self.class.new(spliced)
end
alias splicing splice
170
+
171
+ end # Common
172
+
173
+ end # Sequence
174
+
175
+ end # Bio
@@ -0,0 +1,68 @@
1
+ #
2
+ # = bio/sequence/compat.rb - methods for backward compatibility
3
+ #
4
+ # Copyright:: Copyright (C) 2006
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: Ruby's
7
+ #
8
+ # $Id: compat.rb,v 1.2 2006/02/06 14:18:03 k Exp $
9
+ #
10
+
11
+
12
+ module Bio
13
+
14
+ class Sequence
15
+
16
+ autoload :Common, 'bio/sequence/common'
17
+ autoload :NA, 'bio/sequence/na'
18
+ autoload :AA, 'bio/sequence/aa'
19
+
20
# Returns a new plain String built from the @seq instance variable.
# Also available as #to_str.
def to_s
  plain_copy = String.new(@seq)
  plain_copy
end
alias to_str to_s
24
+
25
+
26
+ module Common
27
+
28
# Returns the sequence as a FASTA formatted string.  +header+ is placed
# after the '>' on the first line.  When +width+ is given, the sequence is
# folded into lines of at most +width+ characters; otherwise it is written
# on a single line.  A deprecation warning is printed when $DEBUG is set.
def to_fasta(header = '', width = nil)
  warn "Bio::Sequence#to_fasta is obsolete. Use Bio::Sequence#output(:fasta) instead" if $DEBUG
  body = self.to_s
  if width
    body = body.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
  else
    body = body + "\n"
  end
  ">#{header}\n" + body
end
40
+
41
+ end # Common
42
+
43
+
44
+ class NA
45
+
46
# Builds an empty sequence with self.new('') and returns the result of its
# #randomize, called with the same arguments and block.
def self.randomize(*arg, &block)
  blank = self.new('')
  blank.randomize(*arg, &block)
end
49
+
50
# Joke method: takes the #dna form of the sequence and rewrites the letters
# a, t, g, c as p, i, k, a.
def pikachu
  bases = self.dna
  bases.tr("atgc", "pika") # joke, of course :-)
end
53
+
54
+ end # NA
55
+
56
+
57
+ class AA
58
+
59
# Builds an empty sequence with self.new('') and returns the result of its
# #randomize, called with the same arguments and block.
def self.randomize(*arg, &block)
  blank = self.new('')
  blank.randomize(*arg, &block)
end
62
+
63
+ end # AA
64
+
65
+
66
+ end # Sequence
67
+
68
+ end # Bio
@@ -0,0 +1,134 @@
1
+ #
2
+ # = bio/sequence/format.rb - various output format of the biological sequence
3
+ #
4
+ # Copyright:: Copyright (C) 2006
5
+ # Toshiaki Katayama <k@bioruby.org>,
6
+ # Naohisa Goto <ng@bioruby.org>
7
+ # License:: Ruby's
8
+ #
9
+ # = TODO
10
+ #
11
+ # porting from N. Goto's feature-output.rb on BioRuby list.
12
+ #
13
+ # $Id: format.rb,v 1.2 2006/02/06 14:20:35 k Exp $
14
+ #
15
+
16
+
17
+ module Bio
18
+
19
+ autoload :Sequence, 'bio/sequence'
20
+
21
+ class Sequence
22
+
23
+ module Format
24
+
25
# Returns the sequence held in @seq as a FASTA formatted string.
# +header+ becomes the comment line; when it is omitted,
# "#{@entry_id} #{@definition}" is used.  When +width+ (integer) is given,
# the sequence is folded into lines of at most +width+ characters.
def format_fasta(header = nil, width = nil)
  header = "#{@entry_id} #{@definition}" unless header
  text = @seq.to_s
  body = width ? text.gsub(Regexp.new(".{1,#{width}}"), "\\0\n") : text + "\n"
  ">#{header}\n" + body
end
38
+
39
# GFF output is not available: always raises NotImplementedError.
def format_gff
  raise(NotImplementedError)
end
42
+
43
# Returns the feature table produced by format_features with GenBank style
# settings: a prefix of five spaces, an indent of that prefix plus 16 more
# spaces, and a column width of 79 minus the indent length.
def format_genbank
  prefix = ' ' * 5
  indent = prefix + ' ' * 16
  format_features(prefix, indent, 79 - indent.length)
end
50
+
51
# Returns the feature table produced by format_features with EMBL style
# settings: the 'FT' line prefix, an indent of that prefix plus 16 spaces,
# and a column width of 80 minus the indent length.
#
# NOTE(review): the prefix literal is kept exactly as it appears here;
# verify its width against the EMBL feature table layout.
def format_embl
  prefix = 'FT '
  indent = prefix + ' ' * 16
  format_features(prefix, indent, 80 - indent.length)
end
58
+
59
+
60
+ private
61
+
62
# Renders every feature in @features as feature table text.
#
# prefix:: text written before the feature key on a feature's first line
# indent:: text written before continuation lines and qualifier lines
# width::  maximum width of the location/qualifier column
#
# For each feature the key (left justified in 16 columns) is followed by
# its position, wrapped to +width+; continuation lines and the qualifier
# lines start with +indent+.  Returns one String in which every line ends
# with a newline ('' when there are no features).
def format_features(prefix, indent, width)
  table = []
  @features.each do |feature|
    head = prefix + sprintf("%-16s", feature.feature)

    position = feature.position
    #position = feature.locations.to_s

    position_lines = wrap(position, width).split("\n")
    # keep the feature key line even when there is no position text
    position_lines = [''] if position_lines.empty?
    position_lines.each do |line|
      table << head << line << "\n"
      head = indent
    end

    table << format_qualifiers(feature.qualifiers, indent, width)
  end
  return table.join
end

# Renders all +qualifiers+ of one feature, each as "/name=value" wrapped to
# +width+, with +indent+ in front of every line and a newline after it.
#
# * a value of +true+ marks a valueless qualifier and is written as "/name"
# * the 'translation' value is quoted and folded at exactly +width+ columns
# * other values containing a non-digit are quoted, with embedded double
#   quotes doubled; purely numeric values are written bare
#
# The qualifier objects themselves are left unchanged.
def format_qualifiers(qualifiers, indent, width)
  qualifiers.map do |qualifier|
    q = qualifier.qualifier
    value = qualifier.value

    lines =
      if value == true
        wrap('/' + q, width)
      elsif q == 'translation'
        fold('/' + q + '="' + value.to_s + '"', width)
      else
        v = value.to_s
        if v[/\D/]
          #v.delete!("\x00-\x1f\x7f-\xff")
          # non-destructive gsub: value.to_s may be the qualifier's own String
          v = '"' + v.gsub(/"/, '""') + '"'
        end
        wrap('/' + q + '=' + v, width)
      end

    lines.split("\n").map { |line| indent + line + "\n" }.join
  end.join
end
102
+
103
# Cuts +str+ into pieces of at most +width+ characters and returns them as
# one String, with a newline after every piece.
def fold(str, width)
  str.gsub(/(.{1,#{width}})/, "\\1\n")
end
106
+
107
# Breaks +str+ into lines of at most +width+ characters and returns them
# joined with newlines (no trailing newline is added).
#
# While the remaining text is longer than +width+, the rightmost break point
# within the first +width+ characters is used: either at a space or just
# after a ',' or ';'.  Spaces around the break are dropped.  When there is
# no break point, the text is cut at exactly +width+ characters.
def wrap(str, width)
  lines = []
  rest = str.dup
  while rest and rest.length > width
    cut = nil
    width.downto(1) do |i|
      if rest[i..i] == ' ' or /[,;]/ =~ rest[(i-1)..(i-1)]
        cut = i
        break
      end
    end

    if cut
      line = rest[0..(cut-1)].sub(/ +\z/, '')
      rest = rest[cut..-1].sub(/\A +/, '')
    else
      line = rest[0..(width-1)]
      rest = rest[width..-1]
    end
    lines << line
  end
  lines << rest if rest
  lines.join("\n")
end
128
+
129
+ end # Format
130
+
131
+ end # Sequence
132
+
133
+ end # Bio
134
+
@@ -0,0 +1,24 @@
1
+ #
2
+ # = bio/sequence/generic.rb - generic sequence class to store an intact string
3
+ #
4
+ # Copyright:: Copyright (C) 2006
5
+ # Toshiaki Katayama <k@bioruby.org>
6
+ # License:: Ruby's
7
+ #
8
+ # $Id: generic.rb,v 1.3 2006/02/06 14:26:04 k Exp $
9
+ #
10
+
11
+ require 'bio/sequence/common'
12
+
13
+ module Bio
14
+ class Sequence
15
+
16
# Generic sequence class: a String that mixes in Bio::Sequence::Common and
# defines nothing else, so no initialize of its own alters the stored text.
class Generic < String
  include Bio::Sequence::Common
end # Generic
21
+
22
+ end # Sequence
23
+ end # Bio
24
+