bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,308 @@
1
+ #
2
+ # bio/reference.rb - journal reference class
3
+ #
4
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: reference.rb,v 1.18 2005/12/18 16:58:58 nakao Exp $
21
+ #
22
+
23
+ module Bio
24
+
25
# Bibliographic data of a single journal article, together with
# formatters that render it in several citation styles.
class Reference

  # Builds a reference from a hash keyed by the strings 'authors',
  # 'title', 'journal', 'volume', 'issue', 'pages', 'year', 'pubmed',
  # 'medline', 'abstract', 'url', 'mesh' and 'affiliations'.
  #
  # Missing (or nil) text fields become '' and missing list fields
  # become [].  The hash itself is only read; it is never modified.
  def initialize(hash)
    @authors      = list_value(hash, 'authors') # [ "Hoge, J.P.", "Fuga, F.B." ]
    @title        = text_value(hash, 'title')   # "Title of the study."
    @journal      = text_value(hash, 'journal') # "Theor. J. Hoge"
    @volume       = text_value(hash, 'volume')  # 12
    @issue        = text_value(hash, 'issue')   # 3
    @pages        = text_value(hash, 'pages')   # 123-145
    @year         = text_value(hash, 'year')    # 2001
    @pubmed       = text_value(hash, 'pubmed')  # 12345678
    @medline      = text_value(hash, 'medline') # 98765432
    @abstract     = text_value(hash, 'abstract')
    @url          = text_value(hash, 'url')
    @mesh         = list_value(hash, 'mesh')
    @affiliations = list_value(hash, 'affiliations')
  end
  attr_reader :authors, :title, :journal, :volume, :issue, :pages, :year,
    :pubmed, :medline, :abstract, :url, :mesh, :affiliations

  # Renders the reference in the named style and returns the string.
  # +style+ may be a String or a Symbol; an unknown or missing style
  # falls back to #general.  +option+ is handed to the formatters that
  # take an argument (bibitem, bibtex, rd, nature).
  def format(style = nil, option = nil)
    case style.to_s
    when 'endnote'           then endnote
    when 'bibitem'           then bibitem(option)
    when 'bibtex'            then bibtex(option)
    when 'rd'                then rd(option)
    when /^nature$/i         then nature(option)
    when /^science$/i        then science
    when /^genome\s*_*biol/i then genome_biol
    when /^genome\s*_*res/i  then genome_res
    when /^nar$/i            then nar
    when /^current/i         then current
    when /^trends/i          then trends
    when /^cell$/i           then cell
    else                          general
    end
  end

  # EndNote tagged format, one "%X value" line per non-empty field.
  #
  # The %U line carries the PubMed citation URL when a PubMed ID is
  # present and the stored url otherwise; it is omitted when neither
  # exists.  The receiver is not modified.
  def endnote
    lines = ["%0 Journal Article"]
    @authors.each { |author| lines << "%A #{author}" }

    tagged = [
      ['%D', @year], ['%T', @title], ['%J', @journal], ['%V', @volume],
      ['%N', @issue], ['%P', @pages], ['%M', @pubmed]
    ]
    tagged.each do |tag, value|
      # to_s so that numeric fields (volume, year, ...) are accepted too
      lines << "#{tag} #{value}" unless value.to_s.empty?
    end

    link = pubmed_url || @url
    lines << "%U #{link}" unless link.to_s.empty?
    lines << "%X #{@abstract}" unless @abstract.to_s.empty?
    @mesh.each { |term| lines << "%K #{term}" }
    lines << "%+ #{@affiliations.join(' ')}" unless @affiliations.empty?
    lines.join("\n")
  end

  # LaTeX \bibitem entry.  +item+ is the citation key and defaults to
  # "PMID:<pubmed>".  Every output line is stripped of surrounding
  # white space.
  def bibitem(item = nil)
    item ||= "PMID:#{@pubmed}"
    pages  = @pages.to_s.sub('-', '--')
    text = [
      "\\bibitem{#{item}}",
      @authors.join(', '),
      "#{@title},",
      "{\\em #{@journal}}, #{@volume}(#{@issue}):#{pages}, #{@year}."
    ].join("\n")
    text.split("\n").collect { |line| line.strip }.join("\n")
  end

  # BibTeX entry.  +section+ is the entry type and defaults to "article".
  #
  # NOTE(review): the exact field indentation of the original template is
  # not recoverable from this listing; two spaces are used here.
  def bibtex(section = nil)
    section ||= "article"
    authors = authors_join(' and ', ' and ')
    pages   = @pages.to_s.sub('-', '--')
    [
      "@#{section}{PMID:#{@pubmed},",
      "  author = {#{authors}},",
      "  title = {#{@title}},",
      "  journal = {#{@journal}},",
      "  year = {#{@year}},",
      "  volume = {#{@volume}},",
      "  number = {#{@issue}},",
      "  pages = {#{pages}},",
      "}",
      ""
    ].join("\n")
  end

  # Default style: Authors (year). "Title" Journal volume:pages.
  def general
    authors = @authors.join(', ')
    "#{authors} (#{@year}). \"#{@title}\" #{@journal} #{@volume}:#{@pages}."
  end

  # RD markup: heading, author line, citation line and abstract,
  # separated by blank lines.  +str+ is used as the abstract text when
  # the reference has no abstract of its own.
  def rd(str = nil)
    abstract = @abstract.to_s.empty? ? str : @abstract
    lines = []
    lines << "== #{@title}"
    lines << "* " + authors_join(' and ')
    lines << "* #{@journal} #{@year} #{@volume}:#{@pages} [PMID:#{@pubmed}]"
    lines << abstract
    lines.join("\n\n")
  end

  # Nature style.  With +short+ the title is dropped and more than four
  # authors are abbreviated to "First et al.".
  def nature(short = false)
    if short
      if @authors.size > 4
        authors = "#{@authors[0]} et al."
      elsif @authors.size == 1
        authors = "#{@authors[0]}"
      else
        authors = authors_join(' & ')
      end
      "#{authors} #{@journal} #{@volume}, #{@pages} (#{@year})."
    else
      authors = authors_join(' & ')
      "#{authors} #{@title} #{@journal} #{@volume}, #{@pages} (#{@year})."
    end
  end

  # Science style: initials before surname, first page only.
  def science
    if @authors.size > 4
      authors = rev_name(@authors[0]) + " et al."
    else
      authors = @authors.collect { |name| rev_name(name) }.join(', ')
    end
    page_from = @pages.to_s.split('-').first
    "#{authors}, #{@journal} #{@volume} #{page_from} (#{@year})."
  end

  # Genome Biology style: commas and dots removed from author names and
  # from the journal name.
  def genome_biol
    authors = @authors.collect { |name| strip_dots(name) }.join(', ')
    journal = strip_dots(@journal)
    "#{authors}: #{@title} #{journal} #{@year}, #{@volume}:#{@pages}."
  end
  alias current genome_biol

  # Genome Research style.
  def genome_res
    authors = authors_join(' and ')
    "#{authors} #{@year}.\n #{@title} #{@journal} #{@volume}: #{@pages}."
  end

  # Nucleic Acids Research style.
  def nar
    authors = authors_join(' and ')
    "#{authors} (#{@year}) #{@title} #{@journal}, #{@volume}, #{@pages}."
  end

  # Cell style.
  def cell
    authors = authors_join(' and ')
    "#{authors} (#{@year}). #{@title} #{@journal} #{@volume}, #{@pages}."
  end

  # Trends style: more than two authors become "First et al.".
  def trends
    if @authors.size > 2
      authors = "#{@authors[0]} et al."
    elsif @authors.size == 1
      authors = "#{@authors[0]}"
    else
      authors = authors_join(' and ')
    end
    "#{authors} (#{@year}) #{@title} #{@journal} #{@volume}, #{@pages}"
  end


  private

  # Value stored under +key+, or a fresh '' when there is none.
  def text_value(hash, key)
    value = hash[key]
    value.nil? ? '' : value
  end

  # Value stored under +key+ as a list: nil and empty values become [],
  # a single String becomes a one-element Array.
  def list_value(hash, key)
    value = hash[key]
    return [] if value.nil?
    return [] if value.respond_to?(:empty?) && value.empty?
    value.is_a?(String) ? [value] : value
  end

  # PubMed citation URL for this reference, or nil without a PubMed ID.
  def pubmed_url
    return nil if @pubmed.to_s.empty?
    cgi  = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
    opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
    "#{cgi}?#{opts}=#{@pubmed}"
  end

  # Removes every ',' and '.' from +data+; returns nil for nil/false.
  def strip_dots(data)
    data.tr(',.', '') if data
  end

  # Joins the authors with +sep+, using +amp+ before the last one.
  # Returns the lone author for a single-author reference and '' when
  # there are no authors.
  def authors_join(amp, sep = ', ')
    return '' if @authors.empty?
    return @authors.first if @authors.length == 1
    @authors[0...-1].join(sep) + amp.to_s + @authors.last
  end

  # Turns "Surname, Initials" into "Initials Surname"; names without a
  # comma are returned unchanged.  White space after the comma is
  # optional.
  def rev_name(name)
    if name =~ /,/
      name, initial = name.split(/,\s*/)
      name = "#{initial} #{name}"
    end
    name
  end

end
237
+
238
+
239
# An ordered collection of Reference objects.
#
# Enumerable is mixed in, so map, select, find and the other iteration
# helpers work on the stored references through #each.
class References
  include Enumerable

  # +ary+ is the initial list of references (an empty list by default).
  def initialize(ary = [])
    @references = ary
  end
  attr_accessor :references

  # Adds +a+ to the collection when it is a Reference; anything else is
  # silently ignored.  Returns self so that calls can be chained.
  def append(a)
    @references.push(a) if a.is_a? Reference
    self
  end

  # Yields every stored reference in order and returns the underlying
  # array.  Without a block an enumerator is returned instead of
  # raising LocalJumpError.
  def each
    return to_enum(:each) unless block_given?
    @references.each { |reference| yield reference }
  end

end
258
+
259
+ end
260
+
261
+
262
+
263
+ =begin
264
+
265
+ = Bio::Reference
266
+
267
+ --- Bio::Reference.new(hash)
268
+
269
+ --- Bio::Reference#authors -> Array
270
+ --- Bio::Reference#title -> String
271
+ --- Bio::Reference#journal -> String
272
+ --- Bio::Reference#volume -> Fixnum
273
+ --- Bio::Reference#issue -> Fixnum
274
+ --- Bio::Reference#pages -> String
275
+ --- Bio::Reference#year -> Fixnum
276
+ --- Bio::Reference#pubmed -> Fixnum
277
+ --- Bio::Reference#medline -> Fixnum
278
+ --- Bio::Reference#abstract -> String
279
+ --- Bio::Reference#url -> String
280
+ --- Bio::Reference#mesh -> Array
281
+ --- Bio::Reference#affiliations -> Array
282
+
283
+ --- Bio::Reference#format(style = nil, option = nil) -> String
284
+
285
+ --- Bio::Reference#endnote
286
+ --- Bio::Reference#bibitem(item = nil) -> String
287
+ --- Bio::Reference#bibtex(section = nil) -> String
288
+ --- Bio::Reference#rd(str = nil) -> String
289
+ --- Bio::Reference#nature(short = false) -> String
290
+ --- Bio::Reference#science -> String
291
+ --- Bio::Reference#genome_biol -> String
292
+ --- Bio::Reference#genome_res -> String
293
+ --- Bio::Reference#nar -> String
294
+ --- Bio::Reference#cell -> String
295
+ --- Bio::Reference#trends -> String
296
+ --- Bio::Reference#general -> String
297
+
298
+ = Bio::References
299
+
300
+ --- Bio::References.new(ary = [])
301
+
302
+ --- Bio::References#references -> Array
303
+ --- Bio::References#append(a) -> Bio::References
304
+ --- Bio::References#each -> Array
305
+
306
+ =end
307
+
308
+
@@ -0,0 +1,593 @@
1
+ #
2
+ # = bio/sequence.rb - biological sequence class
3
+ #
4
+ # Copyright:: Copyright (C) 2000-2005
5
+ # Toshiaki Katayama <k@bioruby.org>,
6
+ # Yoshinori K. Okuji <okuji@embug.org>,
7
+ # Naohisa Goto <ng@bioruby.org>
8
+ # License:: LGPL
9
+ #
10
+ # $Id: sequence.rb,v 0.49 2005/11/27 15:46:01 k Exp $
11
+ #
12
+ #--
13
+ # *TODO* remove this functionality?
14
+ # You can use Bio::Seq instead of Bio::Sequence for short.
15
+ #++
16
+ #
17
+ #--
18
+ #
19
+ # This library is free software; you can redistribute it and/or
20
+ # modify it under the terms of the GNU Lesser General Public
21
+ # License as published by the Free Software Foundation; either
22
+ # version 2 of the License, or (at your option) any later version.
23
+ #
24
+ # This library is distributed in the hope that it will be useful,
25
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
26
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27
+ # Lesser General Public License for more details.
28
+ #
29
+ # You should have received a copy of the GNU Lesser General Public
30
+ # License along with this library; if not, write to the Free Software
31
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32
+ #
33
+ #++
34
+ #
35
+
36
+ require 'bio/data/na'
37
+ require 'bio/data/aa'
38
+ require 'bio/data/codontable'
39
+ require 'bio/location'
40
+
41
+ module Bio
42
+
43
+ # Nucleic/Amino Acid sequence
44
+
45
# Nucleic/Amino Acid sequence stored as a String subclass.  The nested
# classes NA and AA add the nucleotide- and protein-specific methods.
class Sequence < String

  # Builds an NA or an AA object from +str+, whichever #guess selects.
  def self.auto(str)
    guess(str).new(str)
  end

  # Returns the class NA when the fraction of A/T/G/C/U letters (either
  # case) among the non-N letters exceeds +threshold+, otherwise AA.
  # U is counted so that RNA is recognised as nucleic acid.
  def guess(threshold = 0.9)
    cmp = self.composition

    bases = cmp['A'] + cmp['T'] + cmp['G'] + cmp['C'] + cmp['U'] +
            cmp['a'] + cmp['t'] + cmp['g'] + cmp['c'] + cmp['u']

    total = self.length - cmp['N'] - cmp['n']

    # For an empty sequence the float ratio is NaN, the comparison is
    # false and AA is returned.
    bases.to_f / total > threshold ? NA : AA
  end

  # Class-level shortcut for Sequence.new(str).guess(*args).
  def self.guess(str, *args)
    self.new(str).guess(*args)
  end

  # Returns the sequence as a plain String copy.
  def to_s
    String.new(self)
  end
  alias to_str to_s

  # Returns a re-initialized copy of self (cleaned up by the subclass
  # constructor: white space removed, case unified).
  def seq
    self.class.new(self)
  end

  # Like #seq, but re-initializes the receiver itself.
  def normalize!
    initialize(self)
    self
  end
  alias seq! normalize!

  # Appends the argument after passing it through the constructor of the
  # receiver's class, so that the appended text is normalized as well.
  def <<(*arg)
    super(self.class.new(*arg))
  end
  alias concat <<

  # Concatenation that returns an object of the receiver's class.
  def +(*arg)
    self.class.new(super(*arg))
  end

  # Returns the subsequence from position +s+ to position +e+, both
  # 1-based and inclusive.  Returns nil when either position is below 1.
  def subseq(s = 1, e = self.length)
    return nil if s < 1 || e < 1
    same_class(self[(s - 1)..(e - 1)])
  end

  # Returns the sequence in FASTA format.  +header+ becomes the comment
  # line; when +width+ is given the sequence is folded to lines of at
  # most that many characters.
  def to_fasta(header = '', width = nil)
    body =
      if width
        self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
      else
        self.to_s + "\n"
      end
    ">#{header}\n" + body
  end

  # Yields each substring of length +window_size+, advancing by
  # +step_size+.  By specifying +step_size+, codon-sized shifting or
  # splitting a genome sequence into overlapping segments is easy.
  #
  # Returns the remainder of the sequence that follows the last window.
  #
  # Example:
  #   # prints average GC% on each 100bp
  #   seq.window_search(100) do |subseq|
  #     puts subseq.gc_percent
  #   end
  #   # prints every translated peptide (length 5aa) in the same frame
  #   seq.window_search(15, 3) do |subseq|
  #     puts subseq.translate
  #   end
  #   # split genome sequence by 10000bp with 1000bp overlap in fasta format
  #   i = 1
  #   remainder = seq.window_search(10000, 9000) do |subseq|
  #     puts subseq.to_fasta("segment #{i}", 60)
  #     i += 1
  #   end
  #   puts remainder.to_fasta("segment #{i}", 60)
  #
  def window_search(window_size, step_size = 1)
    # The start of the last window is tracked explicitly: block
    # parameters are block-local, so the loop index is not visible here.
    last_start = 0
    0.step(self.length - window_size, step_size) do |start|
      yield same_class(self[start, window_size])
      last_start = start
    end
    same_class(self[(last_start + window_size)..-1])
  end

  # Sums up, along the sequence, the values that +hash+ assigns to each
  # residue/base.  Letters without a value count as 0.0.  Useful with
  # #window_search and amino acid indices.  +hash+ is not modified.
  def total(hash)
    sum = 0.0
    self.each_byte do |byte|
      value = hash[byte.chr]
      sum += value unless value.nil?
    end
    sum
  end

  # Returns a hash of the occurrence counts for each residue or base.
  # Newline characters are not counted (they are not matched by /./).
  def composition
    count = Hash.new(0)
    self.scan(/./) do |x|
      count[x] += 1
    end
    count
  end

  # Returns a randomized sequence keeping the composition of self.
  # +hash+ (letter => count) is required when generating a random
  # sequence from the empty sequence (used by NA.randomize and
  # AA.randomize).  If a block is given, each random residue/base is
  # yielded instead of collected, and an empty sequence is returned.
  #
  # NOTE(review): when +hash+ is given for a non-empty receiver, the
  # result length is self.length plus the hash counts while letters are
  # drawn from the hash only; confirm that this is intended.
  def randomize(hash = nil)
    length = self.length
    if hash
      count = hash.clone
      count.each_value { |x| length += x }
    else
      count = self.composition
    end

    seq = ''
    weighted = {}
    length.times do
      # every letter draws a random score scaled by its remaining count
      count.each do |k, v|
        weighted[k] = v * rand
      end
      winner = weighted.max { |a, b| a[1] <=> b[1] }.first
      count[winner] -= 1

      if block_given?
        yield winner
      else
        seq << winner
      end
    end
    self.class.new(seq)
  end

  # Generates a new random sequence with the given frequency of bases
  # or residues.  The sequence length is the sum of the occurrences.
  def self.randomize(*arg, &block)
    self.new('').randomize(*arg, &block)
  end

  # Receives a GenBank style position string (or a Locations object) and
  # returns the spliced sequence.  Locations on the reverse strand are
  # complemented when the sequence class provides complement!.
  #
  # This method depends on the Locations class, see bio/location.rb
  def splicing(position)
    position = Locations.new(position) unless position.is_a?(Locations)
    s = ''
    position.each do |location|
      if location.sequence
        s << location.sequence
      else
        exon = self.subseq(location.from, location.to)
        if location.strand < 0 && exon.respond_to?(:complement!)
          exon.complement!
        end
        s << exon
      end
    end
    self.class.new(s)
  end

  # Returns +str+ as an instance of the receiver's class.  nil and
  # objects that already have that exact class are passed through; this
  # keeps return types stable whether or not String methods preserve
  # the subclass.
  def same_class(str)
    return str if str.nil? || str.instance_of?(self.class)
    self.class.new(str)
  end
  private :same_class


  # Nucleic Acid sequence

  class NA < Sequence

    # Generates a nucleic acid sequence object from a string: lower
    # case, with spaces, tabs and line breaks removed.
    def initialize(str)
      super
      self.downcase!
      self.tr!(" \t\n\r", '')
    end

    # Spliced sequence, written with 'u' throughout when the result
    # contains any 'u' and with 't' otherwise.
    #
    # This method depends on the Locations class, see bio/location.rb
    def splicing(position)
      mrna = super
      if mrna.rna?
        mrna.tr!('t', 'u')
      else
        mrna.tr!('u', 't')
      end
      mrna
    end

    # Returns complement sequence without reversing ("atgc" -> "tacg")
    def forward_complement
      s = self.class.new(self)
      s.forward_complement!
      s
    end

    # Converts to complement sequence without reversing
    # ("atgc" -> "tacg")
    def forward_complement!
      if self.rna?
        self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
      else
        self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
      end
      self
    end

    # Returns reverse complement sequence ("atgc" -> "gcat")
    def reverse_complement
      s = self.class.new(self)
      s.reverse_complement!
      s
    end

    # Converts to reverse complement sequence ("atgc" -> "gcat")
    def reverse_complement!
      self.reverse!
      self.forward_complement!
    end

    # Aliases for short
    alias complement reverse_complement
    alias complement! reverse_complement!


    # Translates into the amino acid sequence from the given frame and
    # the selected codon table.  The table can also be a Bio::CodonTable
    # object.  The 'unknown' character is used for invalid/unknown
    # codons (can be used for 'nnn' and/or gap translation in practice).
    #
    # Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
    # (4, 5 or 6 is also accepted) for the reverse strand.
    def translate(frame = 1, table = 1, unknown = 'X')
      if table.is_a?(Bio::CodonTable)
        ct = table
      else
        ct = Bio::CodonTable[table]
      end
      naseq = self.dna
      case frame
      when 1, 2, 3
        from = frame - 1
      when 4, 5, 6
        from = frame - 4
        naseq.complement!
      when -1, -2, -3
        from = -1 - frame
        naseq.complement!
      else
        from = 0
      end
      nalen = naseq.length - from
      nalen -= nalen % 3
      # The slice is nil when the frame offset lies beyond the end of a
      # very short sequence; translate that case to an empty peptide.
      coding = naseq[from, nalen] || ''
      aaseq = coding.gsub(/.{3}/) { |codon| ct[codon] || unknown }
      Bio::Sequence::AA.new(aaseq)
    end

    # Returns the counts of each codon in the sequence as a Hash.
    def codon_usage
      hash = Hash.new(0)
      self.window_search(3, 3) do |codon|
        hash[codon] += 1
      end
      hash
    end

    # Calculates the ratio of GC / ATGC bases in percent (integer
    # division).  Returns 0 when the sequence has no a/t/u/g/c at all.
    def gc_percent
      count = self.composition
      at = count['a'] + count['t'] + count['u']
      gc = count['g'] + count['c']
      return 0 if at + gc == 0
      100 * gc / (at + gc)
    end

    # Shows abnormal bases other than 'atgcu'.
    def illegal_bases
      self.scan(/[^atgcu]/).sort.uniq
    end

    # Estimates the weight of this biological string molecule.
    # NucleicAcid is defined in bio/data/na.rb
    def molecular_weight
      if self.rna?
        NucleicAcid.weight(self, true)
      else
        NucleicAcid.weight(self)
      end
    end

    # Converts the universal code string into the regular expression.
    def to_re
      if self.rna?
        NucleicAcid.to_re(self.dna, true)
      else
        NucleicAcid.to_re(self)
      end
    end

    # Converts the self string into the list of the names of each base.
    def names
      array = []
      self.each_byte do |x|
        array.push(NucleicAcid.names[x.chr.upcase])
      end
      array
    end

    # Returns a DNA copy, substituting 't' for 'u'.
    def dna
      same_class(self.tr('u', 't'))
    end

    # In-place variant of #dna; returns nil when nothing was changed.
    def dna!
      self.tr!('u', 't')
    end

    # Returns an RNA copy, substituting 'u' for 't'.
    def rna
      same_class(self.tr('t', 'u'))
    end

    # In-place variant of #rna; returns nil when nothing was changed.
    def rna!
      self.tr!('t', 'u')
    end

    # Truthy (the index of the first 'u') when the sequence contains a
    # 'u', nil otherwise.
    def rna?
      self.index('u')
    end
    protected :rna?

    def pikachu
      self.dna.tr("atgc", "pika") # joke, of course :-)
    end

  end


  # Amino Acid sequence

  class AA < Sequence

    # Generates an amino acid sequence object from a string: upper
    # case, with spaces, tabs and line breaks removed.
    def initialize(str)
      super
      self.upcase!
      self.tr!(" \t\n\r", '')
    end

    # Estimates the weight of this protein.
    # AminoAcid is defined in bio/data/aa.rb
    def molecular_weight
      AminoAcid.weight(self)
    end

    # Converts the sequence into a regular expression via AminoAcid.to_re.
    def to_re
      AminoAcid.to_re(self)
    end

    # Generates the list of the names of each residue along the
    # sequence (3 letters code).
    def codes
      array = []
      self.each_byte do |x|
        array.push(AminoAcid.names[x.chr])
      end
      array
    end

    # Similar to codes but returns long names.
    def names
      self.codes.map do |x|
        AminoAcid.names[x]
      end
    end

  end

end # Sequence
450
+
451
+
452
# A Sequence that additionally carries read/write annotation fields
# (identifier, definition, features, references and so on).
class Seq < Sequence
  attr_accessor :entry_id, :definition
  attr_accessor :features, :references, :comments
  attr_accessor :date, :keywords, :dblinks
  attr_accessor :taxonomy, :moltype
end
456
+
457
+
458
+ end # Bio
459
+
460
+
461
# Self-test, executed only when this file is run directly.  It prints
# the results of the main Bio::Sequence methods for visual inspection.
if __FILE__ == $0

  puts "== Test Bio::Sequence::NA.new"
  p Bio::Sequence::NA.new('')
  p na = Bio::Sequence::NA.new('atgcatgcATGCATGCAAAA')
  p rna = Bio::Sequence::NA.new('augcaugcaugcaugcaaaa')

  puts "\n== Test Bio::Sequence::AA.new"
  p Bio::Sequence::AA.new('')
  p aa = Bio::Sequence::AA.new('ACDEFGHIKLMNPQRSTVWYU')

  puts "\n== Test Bio::Sequence#to_s"
  p na.to_s
  p aa.to_s

  puts "\n== Test Bio::Sequence#subseq(2,6)"
  p na
  p na.subseq(2,6)

  puts "\n== Test Bio::Sequence#[2,6]"
  p na
  p na[2,6]

  puts "\n== Test Bio::Sequence#to_fasta('hoge', 8)"
  puts na.to_fasta('hoge', 8)

  puts "\n== Test Bio::Sequence#window_search(15)"
  p na
  na.window_search(15) {|x| p x}

  puts "\n== Test Bio::Sequence#total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})"
  p na.total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})

  puts "\n== Test Bio::Sequence#composition"
  p na
  p na.composition
  p rna
  p rna.composition

  puts "\n== Test Bio::Sequence::NA#splicing('complement(join(1..5,16..20))')"
  p na
  p na.splicing("complement(join(1..5,16..20))")
  p rna
  p rna.splicing("complement(join(1..5,16..20))")

  puts "\n== Test Bio::Sequence::NA#complement"
  p na.complement
  p rna.complement
  p Bio::Sequence::NA.new('tacgyrkmhdbvswn').complement
  p Bio::Sequence::NA.new('uacgyrkmhdbvswn').complement

  puts "\n== Test Bio::Sequence::NA#translate"
  p na
  p na.translate
  p rna
  p rna.translate

  # The method defined by Bio::Sequence::NA is gc_percent; there is no
  # method called gc.
  puts "\n== Test Bio::Sequence::NA#gc_percent"
  p na.gc_percent
  p rna.gc_percent

  puts "\n== Test Bio::Sequence::NA#illegal_bases"
  p na.illegal_bases
  p Bio::Sequence::NA.new('tacgyrkmhdbvswn').illegal_bases
  p Bio::Sequence::NA.new('abcdefghijklmnopqrstuvwxyz-!%#$@').illegal_bases

  puts "\n== Test Bio::Sequence::NA#molecular_weight"
  p na
  p na.molecular_weight
  p rna
  p rna.molecular_weight

  puts "\n== Test Bio::Sequence::NA#to_re"
  p Bio::Sequence::NA.new('atgcrymkdhvbswn')
  p Bio::Sequence::NA.new('atgcrymkdhvbswn').to_re
  p Bio::Sequence::NA.new('augcrymkdhvbswn')
  p Bio::Sequence::NA.new('augcrymkdhvbswn').to_re

  puts "\n== Test Bio::Sequence::NA#names"
  p na.names

  puts "\n== Test Bio::Sequence::NA#pikachu"
  p na.pikachu

  puts "\n== Test Bio::Sequence::NA#randomize"
  print "Orig  : "; p na
  print "Rand  : "; p na.randomize
  print "Rand  : "; p na.randomize
  print "Rand  : "; p na.randomize.randomize
  print "Block : "; na.randomize do |x| print x end; puts

  print "Orig  : "; p rna
  print "Rand  : "; p rna.randomize
  print "Rand  : "; p rna.randomize
  print "Rand  : "; p rna.randomize.randomize
  print "Block : "; rna.randomize do |x| print x end; puts

  puts "\n== Test Bio::Sequence::NA.randomize(counts)"
  print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'t'=>40}
  print "Rand  : "; p Bio::Sequence::NA.randomize(counts)
  print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'u'=>40}
  print "Rand  : "; p Bio::Sequence::NA.randomize(counts)
  print "Block : "; Bio::Sequence::NA.randomize(counts) {|x| print x}; puts

  puts "\n== Test Bio::Sequence::AA#codes"
  p aa
  p aa.codes

  puts "\n== Test Bio::Sequence::AA#names"
  p aa
  p aa.names

  puts "\n== Test Bio::Sequence::AA#molecular_weight"
  p aa.subseq(1,20)
  p aa.subseq(1,20).molecular_weight

  puts "\n== Test Bio::Sequence::AA#randomize"
  aaseq = 'MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA'
  s = Bio::Sequence::AA.new(aaseq)
  print "Orig  : "; p s
  print "Rand  : "; p s.randomize
  print "Rand  : "; p s.randomize
  print "Rand  : "; p s.randomize.randomize
  print "Block : "; s.randomize {|x| print x}; puts

  puts "\n== Test Bio::Sequence::AA.randomize(counts)"
  print "Count : "; p counts = s.composition
  print "Rand  : "; puts Bio::Sequence::AA.randomize(counts)
  print "Block : "; Bio::Sequence::AA.randomize(counts) {|x| print x}; puts

end
592
+
593
+