bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,308 @@
1
+ #
2
+ # bio/reference.rb - journal reference class
3
+ #
4
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: reference.rb,v 1.18 2005/12/18 16:58:58 nakao Exp $
21
+ #
22
+
23
+ module Bio
24
+
25
# Bibliographic data of a single journal article, with formatters for
# several citation styles (EndNote, BibTeX/bibitem, RD and a number of
# journal-specific layouts).
class Reference

  # Builds a reference from a Hash with String keys ('authors', 'title',
  # 'journal', 'volume', 'issue', 'pages', 'year', 'pubmed', 'medline',
  # 'abstract', 'url', 'mesh', 'affiliations').
  #
  # Missing (or nil) scalar fields become '' and missing list fields
  # ('authors', 'mesh', 'affiliations') become [].  The hash passed in is
  # only read; its default value is left untouched.
  def initialize(hash)
    @authors      = fetch_field(hash, 'authors')      # [ "Hoge, J.P.", "Fuga, F.B." ]
    @title        = fetch_field(hash, 'title')        # "Title of the study."
    @journal      = fetch_field(hash, 'journal')      # "Theor. J. Hoge"
    @volume       = fetch_field(hash, 'volume')       # 12
    @issue        = fetch_field(hash, 'issue')        # 3
    @pages        = fetch_field(hash, 'pages')        # 123-145
    @year         = fetch_field(hash, 'year')         # 2001
    @pubmed       = fetch_field(hash, 'pubmed')       # 12345678
    @medline      = fetch_field(hash, 'medline')      # 98765432
    @abstract     = fetch_field(hash, 'abstract')
    @url          = fetch_field(hash, 'url')
    @mesh         = fetch_field(hash, 'mesh')
    @affiliations = fetch_field(hash, 'affiliations')
    @authors      = [] if @authors.empty?
    @mesh         = [] if @mesh.empty?
    @affiliations = [] if @affiliations.empty?
  end
  attr_reader :authors, :title, :journal, :volume, :issue, :pages, :year,
    :pubmed, :medline, :abstract, :url, :mesh, :affiliations

  # Renders the reference in the requested style and returns a String.
  # 'endnote', 'bibitem', 'bibtex' and 'rd' are matched exactly; journal
  # styles are matched case-insensitively by regular expression.  Any other
  # value (including nil) falls back to #general.  +option+ is forwarded to
  # the formatters that accept an argument.
  def format(style = nil, option = nil)
    case style
    when 'endnote'            then endnote
    when 'bibitem'            then bibitem(option)
    when 'bibtex'             then bibtex(option)
    when 'rd'                 then rd(option)
    when /^nature$/i          then nature(option)
    when /^science$/i         then science
    when /^genome\s*_*biol/i  then genome_biol
    when /^genome\s*_*res/i   then genome_res
    when /^nar$/i             then nar
    when /^current/i          then current
    when /^trends/i           then trends
    when /^cell$/i            then cell
    else                           general
    end
  end

  # Returns the reference as EndNote tagged text, one "%X value" line per
  # non-empty field.
  #
  # The %U line holds the PubMed citation URL when a PubMed ID is present,
  # otherwise the URL given at construction time (if any).  The stored URL
  # is never modified.
  def endnote
    lines = ["%0 Journal Article"]
    @authors.each { |author| lines << "%A #{author}" }
    lines << "%D #{@year}"    unless empty_field?(@year)
    lines << "%T #{@title}"   unless empty_field?(@title)
    lines << "%J #{@journal}" unless empty_field?(@journal)
    lines << "%V #{@volume}"  unless empty_field?(@volume)
    lines << "%N #{@issue}"   unless empty_field?(@issue)
    lines << "%P #{@pages}"   unless empty_field?(@pages)
    lines << "%M #{@pubmed}"  unless empty_field?(@pubmed)
    link = empty_field?(@pubmed) ? @url : pubmed_url
    lines << "%U #{link}"     unless empty_field?(link)
    lines << "%X #{@abstract}" unless empty_field?(@abstract)
    @mesh.each { |term| lines << "%K #{term}" }
    lines << "%+ #{@affiliations.join(' ')}" unless @affiliations.empty?
    lines.join("\n")
  end

  # Returns a LaTeX \bibitem entry.  +item+ is the citation key and
  # defaults to "PMID:<pubmed>".  Every output line is stripped of
  # surrounding whitespace.
  def bibitem(item = nil)
    item ||= "PMID:#{@pubmed}"
    pages = @pages.to_s.sub('-', '--')
    text = [
      "\\bibitem{#{item}}",
      @authors.join(', '),
      "#{@title},",
      "{\\em #{@journal}}, #{@volume}(#{@issue}):#{pages}, #{@year}."
    ].join("\n")
    # String is not Enumerable on Ruby >= 1.9, so split into lines explicitly.
    text.split("\n").collect { |line| line.strip }.join("\n")
  end

  # Returns a BibTeX entry keyed by "PMID:<pubmed>".  +section+ is the
  # entry type and defaults to "article".  The result ends with a newline.
  def bibtex(section = nil)
    section ||= "article"
    authors = authors_join(' and ', ' and ')
    pages   = @pages.to_s.sub('-', '--')
    [
      "@#{section}{PMID:#{@pubmed},",
      "  author = {#{authors}},",
      "  title = {#{@title}},",
      "  journal = {#{@journal}},",
      "  year = {#{@year}},",
      "  volume = {#{@volume}},",
      "  number = {#{@issue}},",
      "  pages = {#{pages}},",
      "}",
      ""
    ].join("\n")
  end

  # Default citation layout used by #format for unknown styles.
  def general
    authors = @authors.join(', ')
    "#{authors} (#{@year}). \"#{@title}\" #{@journal} #{@volume}:#{@pages}."
  end

  # Returns the reference as an RD section: title, authors, citation line
  # and abstract, separated by blank lines.  +str+ is used as the abstract
  # when the reference itself has none.
  def rd(str = nil)
    abstract = empty_field?(@abstract) ? str : @abstract
    [
      "== #{@title}",
      "* #{authors_join(' and ')}",
      "* #{@journal} #{@year} #{@volume}:#{@pages} [PMID:#{@pubmed}]",
      abstract
    ].join("\n\n")
  end

  # Nature style.  With a true +short+ the title is omitted and more than
  # four authors are abbreviated to "First et al.".
  def nature(short = false)
    if short
      if @authors.size > 4
        authors = "#{@authors[0]} et al."
      elsif @authors.size == 1
        authors = "#{@authors[0]}"
      else
        authors = authors_join(' & ')
      end
      "#{authors} #{@journal} #{@volume}, #{@pages} (#{@year})."
    else
      authors = authors_join(' & ')
      "#{authors} #{@title} #{@journal} #{@volume}, #{@pages} (#{@year})."
    end
  end

  # Science style: initials before the family name, first page only, and
  # "First et al." for more than four authors.
  def science
    if @authors.size > 4
      authors = rev_name(@authors[0]) + " et al."
    else
      authors = @authors.collect { |name| rev_name(name) }.join(', ')
    end
    page_from, = @pages.to_s.split('-')
    "#{authors}, #{@journal} #{@volume} #{page_from} (#{@year})."
  end

  # Genome Biology style: commas and dots are removed from author names
  # and from the journal name.
  def genome_biol
    authors = @authors.collect { |name| strip_dots(name) }.join(', ')
    journal = strip_dots(@journal)
    "#{authors}: #{@title} #{journal} #{@year}, #{@volume}:#{@pages}."
  end
  alias current genome_biol

  # Genome Research style.
  def genome_res
    authors = authors_join(' and ')
    "#{authors} #{@year}.\n #{@title} #{@journal} #{@volume}: #{@pages}."
  end

  # Nucleic Acids Research style.
  def nar
    authors = authors_join(' and ')
    "#{authors} (#{@year}) #{@title} #{@journal}, #{@volume}, #{@pages}."
  end

  # Cell style.
  def cell
    authors = authors_join(' and ')
    "#{authors} (#{@year}). #{@title} #{@journal} #{@volume}, #{@pages}."
  end

  # Trends style: more than two authors are abbreviated to "First et al.".
  def trends
    if @authors.size > 2
      authors = "#{@authors[0]} et al."
    elsif @authors.size == 1
      authors = "#{@authors[0]}"
    else
      authors = authors_join(' and ')
    end
    "#{authors} (#{@year}) #{@title} #{@journal} #{@volume}, #{@pages}"
  end


  private

  # Reads +key+ from +hash+ without touching the hash; a missing key or a
  # nil value yields ''.
  def fetch_field(hash, key)
    value = hash.fetch(key, nil)
    value.nil? ? '' : value
  end

  # True when a scalar field has no printable content (nil, '' or anything
  # whose to_s is empty).  Works for numeric fields as well as strings.
  def empty_field?(value)
    value.to_s.empty?
  end

  # PubMed citation URL for this reference's PubMed ID.
  def pubmed_url
    cgi  = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
    opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
    "#{cgi}?#{opts}=#{@pubmed}"
  end

  # Removes every comma and dot from +data+; returns nil for nil.
  def strip_dots(data)
    data.delete(',.') if data
  end

  # Joins the author list with +sep+, using +amp+ before the last author.
  # Returns '' when there are no authors.
  def authors_join(amp, sep = ', ')
    case @authors.length
    when 0 then ""
    when 1 then @authors.first
    else
      "#{@authors[0...-1].join(sep)}#{amp}#{@authors.last}"
    end
  end

  # Turns "Family, I.N." into "I.N. Family".  Names without a comma are
  # returned unchanged; whitespace after the comma is optional.
  def rev_name(name)
    if name =~ /,/
      family, initial = name.split(/,\s*/)
      name = [initial, family].compact.join(' ')
    end
    name
  end

end
237
+
238
+
239
# Ordered collection of Reference objects.
class References

  # Makes map, select, include?, etc. available on top of #each.
  include Enumerable

  # The underlying Array of references.
  attr_accessor :references

  # Wraps +ary+ (used as-is, not copied) as the list of references.
  def initialize(ary = [])
    @references = ary
  end

  # Adds +a+ to the collection when it is a Reference; anything else is
  # silently ignored.  Returns self so that calls can be chained.
  def append(a)
    @references.push(a) if a.is_a? Reference
    self
  end

  # Yields each stored reference in order.  Returns the underlying Array.
  def each
    @references.each do |reference|
      yield reference
    end
  end

end
258
+
259
+ end
260
+
261
+
262
+
263
+ =begin
264
+
265
+ = Bio::Reference
266
+
267
+ --- Bio::Reference.new(hash)
268
+
269
+ --- Bio::Reference#authors -> Array
270
+ --- Bio::Reference#title -> String
271
+ --- Bio::Reference#journal -> String
272
+ --- Bio::Reference#volume -> Fixnum
273
+ --- Bio::Reference#issue -> Fixnum
274
+ --- Bio::Reference#pages -> String
275
+ --- Bio::Reference#year -> Fixnum
276
+ --- Bio::Reference#pubmed -> Fixnum
277
+ --- Bio::Reference#medline -> Fixnum
278
+ --- Bio::Reference#abstract -> String
279
+ --- Bio::Reference#url -> String
280
+ --- Bio::Reference#mesh -> Array
281
+ --- Bio::Reference#affiliations -> Array
282
+
283
+ --- Bio::Reference#format(style = nil, option = nil) -> String
284
+
285
+ --- Bio::Reference#endnote -> String
286
+ --- Bio::Reference#bibitem(item = nil) -> String
287
+ --- Bio::Reference#bibtex(section = nil) -> String
288
+ --- Bio::Reference#rd(str = nil) -> String
289
+ --- Bio::Reference#nature(short = false) -> String
290
+ --- Bio::Reference#science -> String
291
+ --- Bio::Reference#genome_biol -> String
292
+ --- Bio::Reference#genome_res -> String
293
+ --- Bio::Reference#nar -> String
294
+ --- Bio::Reference#cell -> String
295
+ --- Bio::Reference#trends -> String
296
+ --- Bio::Reference#general -> String
297
+
298
+ = Bio::References
299
+
300
+ --- Bio::References.new(ary = [])
301
+
302
+ --- Bio::References#references -> Array
303
+ --- Bio::References#append(a) -> Bio::References
304
+ --- Bio::References#each -> Array
305
+
306
+ =end
307
+
308
+
@@ -0,0 +1,593 @@
1
+ #
2
+ # = bio/sequence.rb - biological sequence class
3
+ #
4
+ # Copyright:: Copyright (C) 2000-2005
5
+ # Toshiaki Katayama <k@bioruby.org>,
6
+ # Yoshinori K. Okuji <okuji@embug.org>,
7
+ # Naohisa Goto <ng@bioruby.org>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: sequence.rb,v 0.49 2005/11/27 15:46:01 k Exp $
11
+ #
12
+ #--
13
+ # *TODO* remove this functionality?
14
+ # You can use Bio::Seq instead of Bio::Sequence for short.
15
+ #++
16
+ #
17
+ #--
18
+ #
19
+ # This library is free software; you can redistribute it and/or
20
+ # modify it under the terms of the GNU Lesser General Public
21
+ # License as published by the Free Software Foundation; either
22
+ # version 2 of the License, or (at your option) any later version.
23
+ #
24
+ # This library is distributed in the hope that it will be useful,
25
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
26
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27
+ # Lesser General Public License for more details.
28
+ #
29
+ # You should have received a copy of the GNU Lesser General Public
30
+ # License along with this library; if not, write to the Free Software
31
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32
+ #
33
+ #++
34
+ #
35
+
36
+ require 'bio/data/na'
37
+ require 'bio/data/aa'
38
+ require 'bio/data/codontable'
39
+ require 'bio/location'
40
+
41
+ module Bio
42
+
43
+ # Nucleic/Amino Acid sequence
44
+
45
# Nucleic/amino acid sequence, implemented as a String subclass.
# NA and AA (nested below) add the molecule-specific behaviour.
class Sequence < String

  # Builds a sequence object from +str+, choosing NA or AA according to
  # Sequence.guess.
  def self.auto(str)
    moltype = self.guess(str)
    if moltype == NA
      NA.new(str)
    else
      AA.new(str)
    end
  end

  # Guesses the molecule type from the composition.  Returns the NA class
  # when the fraction of A/T/G/C/U (either case) among the non-N characters
  # exceeds +threshold+, otherwise the AA class.  An empty sequence is
  # reported as AA.
  def guess(threshold = 0.9)
    cmp = self.composition

    # U is counted too so that RNA is not mistaken for protein.
    bases = %w[A T G C U a t g c u].inject(0) { |sum, base| sum + cmp[base] }

    total = self.length - cmp['N'] - cmp['n']

    if bases.to_f / total > threshold
      return NA
    else
      return AA
    end
  end

  # Class-level shortcut for Sequence#guess on +str+; extra arguments are
  # forwarded.
  def self.guess(str, *args)
    self.new(str).guess(*args)
  end

  # Returns the sequence as a plain String.
  def to_s
    String.new(self)
  end
  alias to_str to_s

  # Force self to re-initialize for clean up (remove white spaces,
  # case unification).  Returns a new object of the same class.
  def seq
    self.class.new(self)
  end

  # Similar to the 'seq' method, but changes the self object destructively.
  def normalize!
    initialize(self)
    self
  end
  alias seq! normalize!

  # Appends a sequence after passing the argument through the constructor
  # of the receiver's class (so it is normalized the same way).
  def <<(*arg)
    super(self.class.new(*arg))
  end
  alias concat <<

  # Concatenation that returns an object of the receiver's class.
  def +(*arg)
    self.class.new(super(*arg))
  end

  # Returns the subsequence from position +s+ to +e+ (1-based, inclusive)
  # as an object of the receiver's class.  Returns nil when either position
  # is below 1 or when +s+ lies beyond the end of the sequence.
  def subseq(s = 1, e = self.length)
    return nil if s < 1 or e < 1
    rewrap(self[(s - 1)..(e - 1)])
  end

  # Output the FASTA format string of the sequence.  The 1st argument is
  # used as the comment string.  If the 2nd option is given, the output
  # sequence will be folded at that width.
  def to_fasta(header = '', width = nil)
    body =
      if width
        self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
      else
        self.to_s + "\n"
      end
    ">#{header}\n" + body
  end

  # This method iterates on sub strings with the specified length
  # 'window_size'.  By specifying 'step_size', codon sized shifting or
  # splitting a genome sequence with overlapping ends can easily be yielded.
  #
  # The remainder sequence at the terminal end (everything after the last
  # yielded window) will be returned.
  #
  # Example:
  #   # prints average GC% on each 100bp
  #   seq.window_search(100) do |subseq|
  #     puts subseq.gc_percent
  #   end
  #   # prints every translated peptide (length 5aa) in the same frame
  #   seq.window_search(15, 3) do |subseq|
  #     puts subseq.translate
  #   end
  #   # split genome sequence by 10000bp with 1000bp overlap in fasta format
  #   i = 1
  #   remainder = seq.window_search(10000, 9000) do |subseq|
  #     puts subseq.to_fasta("segment #{i}", 60)
  #     i += 1
  #   end
  #   puts remainder.to_fasta("segment #{i}", 60)
  #
  def window_search(window_size, step_size = 1)
    # Track the last start position explicitly: block parameters are local
    # to the block on Ruby >= 1.9 and do not update an outer variable.
    last_step = 0
    0.step(self.length - window_size, step_size) do |start|
      yield rewrap(self[start, window_size])
      last_step = start
    end
    rewrap(self[(last_step + window_size)..-1])
  end

  # This method receives a hash of residues/bases to the particular values,
  # and sums up the value along with the self sequence.  Especially useful
  # to use with the window_search method and amino acid indices etc.
  # Characters without an entry contribute 0.0 (or the hash's own default
  # when it has one).  The hash is not modified.
  def total(hash)
    sum = 0.0
    self.each_byte do |x|
      value = hash[x.chr]
      sum += value unless value.nil?
    end
    sum
  end

  # Returns a hash of the occurrence counts for each residue or base.
  # Missing keys read as 0.  Newline characters are not counted.
  def composition
    count = Hash.new(0)
    self.scan(/./) do |x|
      count[x] += 1
    end
    count
  end

  # Returns a randomized sequence keeping its composition by default.
  # When +hash+ (character => count) is given, the new sequence is built
  # from those counts alone and its length is their sum; this is how the
  # class methods NA.randomize and AA.randomize generate a sequence from
  # the empty one.
  # If the block is given, yields each random residue/base instead of
  # collecting it, and an empty sequence is returned.
  def randomize(hash = nil)
    count = hash ? hash.clone : self.composition
    length = 0
    count.each_value { |x| length += x }

    picked = []
    tmp = {}
    length.times do
      # Weight every character by its remaining count times a random
      # factor and take the heaviest one.
      count.each do |k, v|
        tmp[k] = v * rand
      end
      max = tmp.max { |a, b| a[1] <=> b[1] }
      count[max.first] -= 1

      if block_given?
        yield max.first
      else
        picked << max.first
      end
    end
    self.class.new(picked.join)
  end

  # Generate a new random sequence with the given frequency of bases
  # or residues.  The sequence length is determined by the sum of each
  # base/residue occurrences.
  def self.randomize(*arg, &block)
    self.new('').randomize(*arg, &block)
  end

  # Receive a GenBank style position string and convert it to the Locations
  # objects to splice the sequence itself.  See also: bio/location.rb
  #
  # Each location either supplies its own replacement sequence or selects
  # a subsequence (from..to); subsequences on the reverse strand are
  # complemented when the sequence class supports complement!.
  #
  # This method depends on Locations class, see bio/location.rb
  def splicing(position)
    position = Locations.new(position) unless position.is_a?(Locations)
    s = ''
    position.each do |location|
      if location.sequence
        s << location.sequence
      else
        exon = self.subseq(location.from, location.to)
        # Only nucleic acid sequences can be complemented; ask instead of
        # rescuing NameError, which would also hide unrelated failures.
        if location.strand < 0 and exon.respond_to?(:complement!)
          exon.complement!
        end
        s << exon
      end
    end
    self.class.new(s)
  end

  # Returns +str+ as an object of the receiver's class.  Slicing and tr
  # return plain Strings for String subclasses on Ruby >= 3.0, so results
  # are wrapped again where callers expect a sequence object.  nil and
  # objects that already have the right class are passed through.
  def rewrap(str)
    return str if str.nil? or str.instance_of?(self.class)
    self.class.new(str)
  end
  private :rewrap


  # Nucleic Acid sequence

  class NA < Sequence

    # Generate a nucleic acid sequence object from a string.  The content
    # is lower-cased and spaces, tabs and line breaks are removed.
    def initialize(str)
      super
      self.downcase!
      self.tr!(" \t\n\r", '')
    end

    # Splices like Sequence#splicing, then makes the result uniformly RNA
    # (t -> u) if it contains any 'u', or uniformly DNA otherwise.
    #
    # This method depends on Locations class, see bio/location.rb
    def splicing(position)
      mrna = super
      if mrna.rna?
        mrna.tr!('t', 'u')
      else
        mrna.tr!('u', 't')
      end
      mrna
    end

    # Returns complement sequence without reversing ("atgc" -> "tacg")
    def forward_complement
      s = self.class.new(self)
      s.forward_complement!
      s
    end

    # Convert to complement sequence without reversing ("atgc" -> "tacg").
    # Ambiguity codes are complemented as well.  Returns self.
    def forward_complement!
      if self.rna?
        self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
      else
        self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
      end
      self
    end

    # Returns reverse complement sequence ("atgc" -> "gcat")
    def reverse_complement
      s = self.class.new(self)
      s.reverse_complement!
      s
    end

    # Convert to reverse complement sequence ("atgc" -> "gcat").
    # Returns self.
    def reverse_complement!
      self.reverse!
      self.forward_complement!
    end

    # Aliases for short
    alias complement reverse_complement
    alias complement! reverse_complement!


    # Translate into the amino acid sequence from the given frame and the
    # selected codon table.  The table also can be a Bio::CodonTable object.
    # The 'unknown' character is used for invalid/unknown codon (can be
    # used for 'nnn' and/or gap translation in practice).
    #
    # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
    # (4, 5 or 6 is also accepted) for the reverse strand.  Any other value
    # is treated as frame 1.  A sequence too short for the frame translates
    # to an empty amino acid sequence.
    def translate(frame = 1, table = 1, unknown = 'X')
      if table.is_a?(Bio::CodonTable)
        ct = table
      else
        ct = Bio::CodonTable[table]
      end
      naseq = self.dna
      case frame
      when 1, 2, 3
        from = frame - 1
      when 4, 5, 6
        from = frame - 4
        naseq.complement!
      when -1, -2, -3
        from = -1 - frame
        naseq.complement!
      else
        from = 0
      end
      nalen = naseq.length - from
      nalen -= nalen % 3                      # drop the incomplete last codon
      codons = nalen > 0 ? naseq[from, nalen] : ''
      aaseq = codons.gsub(/.{3}/) { |codon| ct[codon] or unknown }
      return Bio::Sequence::AA.new(aaseq)
    end

    # Returns counts of the each codon in the sequence by Hash.
    def codon_usage
      hash = Hash.new(0)
      self.window_search(3, 3) do |codon|
        hash[codon] += 1
      end
      hash
    end

    # Calculate the ratio of GC / ATGC bases in percent (integer division).
    # Returns 0 when the sequence contains none of a, t, u, g, c.
    def gc_percent
      count = self.composition
      at = count['a'] + count['t'] + count['u']
      gc = count['g'] + count['c']
      return 0 if at + gc == 0
      100 * gc / (at + gc)
    end

    # Show abnormal bases other than 'atgcu' (sorted, without duplicates).
    def illegal_bases
      self.scan(/[^atgcu]/).sort.uniq
    end

    # Estimate the weight of this biological string molecule.
    # NucleicAcid is defined in bio/data/na.rb
    def molecular_weight
      if self.rna?
        NucleicAcid.weight(self, true)
      else
        NucleicAcid.weight(self)
      end
    end

    # Convert the universal code string into the regular expression.
    def to_re
      if self.rna?
        NucleicAcid.to_re(self.dna, true)
      else
        NucleicAcid.to_re(self)
      end
    end

    # Convert the self string into the list of the names of the each base.
    def names
      array = []
      self.each_byte do |x|
        array.push(NucleicAcid.names[x.chr.upcase])
      end
      array
    end

    # Output a DNA string by substituting 'u' to 't'.
    def dna
      rewrap(self.tr('u', 't'))
    end

    # Destructive version of dna.  Returns the result of String#tr!, i.e.
    # nil when the sequence contains no 'u'.
    def dna!
      self.tr!('u', 't')
    end

    # Output a RNA string by substituting 't' to 'u'.
    def rna
      rewrap(self.tr('t', 'u'))
    end

    # Destructive version of rna.  Returns the result of String#tr!, i.e.
    # nil when the sequence contains no 't'.
    def rna!
      self.tr!('t', 'u')
    end

    # Truthy (the index of the first 'u') when the sequence contains a
    # 'u', nil otherwise.
    def rna?
      self.index('u')
    end
    protected :rna?

    def pikachu
      self.dna.tr("atgc", "pika") # joke, of course :-)
    end

  end


  # Amino Acid sequence

  class AA < Sequence

    # Generate an amino acid sequence object from a string.  The content
    # is upper-cased and spaces, tabs and line breaks are removed.
    def initialize(str)
      super
      self.upcase!
      self.tr!(" \t\n\r", '')
    end

    # Estimate the weight of this protein.
    # AminoAcid is defined in bio/data/aa.rb
    def molecular_weight
      AminoAcid.weight(self)
    end

    # Delegates to AminoAcid.to_re (defined in bio/data/aa.rb).
    def to_re
      AminoAcid.to_re(self)
    end

    # Generate the list of the names of the each residue along with the
    # sequence (3 letters code).
    def codes
      array = []
      self.each_byte do |x|
        array.push(AminoAcid.names[x.chr])
      end
      array
    end

    # Similar to codes but returns long names.
    def names
      self.codes.map do |x|
        AminoAcid.names[x]
      end
    end

  end

end # Sequence
450
+
451
+
452
# Sequence that additionally carries read/write annotation attributes
# (entry id, definition, features, references, and so on).
class Seq < Sequence
  attr_accessor :entry_id, :definition, :features, :references, :comments
  attr_accessor :date, :keywords, :dblinks, :taxonomy, :moltype
end
456
+
457
+
458
+ end # Bio
459
+
460
+
461
# Self-test: runs only when this file is executed directly and prints the
# result of each public method for visual inspection.
if __FILE__ == $0

  puts "== Test Bio::Sequence::NA.new"
  p Bio::Sequence::NA.new('')
  p na = Bio::Sequence::NA.new('atgcatgcATGCATGCAAAA')
  p rna = Bio::Sequence::NA.new('augcaugcaugcaugcaaaa')

  puts "\n== Test Bio::Sequence::AA.new"
  p Bio::Sequence::AA.new('')
  p aa = Bio::Sequence::AA.new('ACDEFGHIKLMNPQRSTVWYU')

  puts "\n== Test Bio::Sequence#to_s"
  p na.to_s
  p aa.to_s

  puts "\n== Test Bio::Sequence#subseq(2,6)"
  p na
  p na.subseq(2,6)

  puts "\n== Test Bio::Sequence#[2,6]"
  p na
  p na[2,6]

  puts "\n== Test Bio::Sequence#to_fasta('hoge', 8)"
  puts na.to_fasta('hoge', 8)

  puts "\n== Test Bio::Sequence#window_search(15)"
  p na
  na.window_search(15) {|x| p x}

  puts "\n== Test Bio::Sequence#total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})"
  p na.total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})

  puts "\n== Test Bio::Sequence#composition"
  p na
  p na.composition
  p rna
  p rna.composition

  puts "\n== Test Bio::Sequence::NA#splicing('complement(join(1..5,16..20))')"
  p na
  p na.splicing("complement(join(1..5,16..20))")
  p rna
  p rna.splicing("complement(join(1..5,16..20))")

  puts "\n== Test Bio::Sequence::NA#complement"
  p na.complement
  p rna.complement
  p Bio::Sequence::NA.new('tacgyrkmhdbvswn').complement
  p Bio::Sequence::NA.new('uacgyrkmhdbvswn').complement

  puts "\n== Test Bio::Sequence::NA#translate"
  p na
  p na.translate
  p rna
  p rna.translate

  # The method defined by Bio::Sequence::NA is gc_percent; this file
  # defines no 'gc' method.
  puts "\n== Test Bio::Sequence::NA#gc_percent"
  p na.gc_percent
  p rna.gc_percent

  puts "\n== Test Bio::Sequence::NA#illegal_bases"
  p na.illegal_bases
  p Bio::Sequence::NA.new('tacgyrkmhdbvswn').illegal_bases
  p Bio::Sequence::NA.new('abcdefghijklmnopqrstuvwxyz-!%#$@').illegal_bases

  puts "\n== Test Bio::Sequence::NA#molecular_weight"
  p na
  p na.molecular_weight
  p rna
  p rna.molecular_weight

  puts "\n== Test Bio::Sequence::NA#to_re"
  p Bio::Sequence::NA.new('atgcrymkdhvbswn')
  p Bio::Sequence::NA.new('atgcrymkdhvbswn').to_re
  p Bio::Sequence::NA.new('augcrymkdhvbswn')
  p Bio::Sequence::NA.new('augcrymkdhvbswn').to_re

  puts "\n== Test Bio::Sequence::NA#names"
  p na.names

  puts "\n== Test Bio::Sequence::NA#pikachu"
  p na.pikachu

  puts "\n== Test Bio::Sequence::NA#randomize"
  print "Orig : "; p na
  print "Rand : "; p na.randomize
  print "Rand : "; p na.randomize
  print "Rand : "; p na.randomize.randomize
  print "Block : "; na.randomize do |x| print x end; puts

  print "Orig : "; p rna
  print "Rand : "; p rna.randomize
  print "Rand : "; p rna.randomize
  print "Rand : "; p rna.randomize.randomize
  print "Block : "; rna.randomize do |x| print x end; puts

  puts "\n== Test Bio::Sequence::NA.randomize(counts)"
  print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'t'=>40}
  print "Rand : "; p Bio::Sequence::NA.randomize(counts)
  print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'u'=>40}
  print "Rand : "; p Bio::Sequence::NA.randomize(counts)
  print "Block : "; Bio::Sequence::NA.randomize(counts) {|x| print x}; puts

  puts "\n== Test Bio::Sequence::AA#codes"
  p aa
  p aa.codes

  puts "\n== Test Bio::Sequence::AA#names"
  p aa
  p aa.names

  puts "\n== Test Bio::Sequence::AA#molecular_weight"
  p aa.subseq(1,20)
  p aa.subseq(1,20).molecular_weight

  puts "\n== Test Bio::Sequence::AA#randomize"
  aaseq = 'MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA'
  s = Bio::Sequence::AA.new(aaseq)
  print "Orig : "; p s
  print "Rand : "; p s.randomize
  print "Rand : "; p s.randomize
  print "Rand : "; p s.randomize.randomize
  print "Block : "; s.randomize {|x| print x}; puts

  puts "\n== Test Bio::Sequence::AA.randomize(counts)"
  print "Count : "; p counts = s.composition
  print "Rand : "; puts Bio::Sequence::AA.randomize(counts)
  print "Block : "; Bio::Sequence::AA.randomize(counts) {|x| print x}; puts

end
592
+
593
+