bio 0.7.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. data/bin/bioruby +71 -27
  2. data/bin/br_biofetch.rb +5 -17
  3. data/bin/br_bioflat.rb +14 -26
  4. data/bin/br_biogetseq.rb +6 -18
  5. data/bin/br_pmfetch.rb +6 -16
  6. data/doc/Changes-0.7.rd +35 -0
  7. data/doc/KEGG_API.rd +287 -172
  8. data/doc/KEGG_API.rd.ja +273 -160
  9. data/doc/Tutorial.rd +18 -9
  10. data/doc/Tutorial.rd.ja +656 -138
  11. data/lib/bio.rb +6 -24
  12. data/lib/bio/alignment.rb +5 -5
  13. data/lib/bio/appl/blast.rb +132 -98
  14. data/lib/bio/appl/blast/format0.rb +9 -19
  15. data/lib/bio/appl/blast/wublast.rb +5 -18
  16. data/lib/bio/appl/emboss.rb +40 -47
  17. data/lib/bio/appl/hmmer.rb +116 -82
  18. data/lib/bio/appl/hmmer/report.rb +509 -364
  19. data/lib/bio/appl/spidey/report.rb +7 -18
  20. data/lib/bio/data/na.rb +3 -21
  21. data/lib/bio/db.rb +3 -21
  22. data/lib/bio/db/aaindex.rb +147 -52
  23. data/lib/bio/db/embl/common.rb +27 -6
  24. data/lib/bio/db/embl/embl.rb +18 -10
  25. data/lib/bio/db/embl/sptr.rb +87 -67
  26. data/lib/bio/db/embl/swissprot.rb +32 -3
  27. data/lib/bio/db/embl/trembl.rb +32 -3
  28. data/lib/bio/db/embl/uniprot.rb +32 -3
  29. data/lib/bio/db/fasta.rb +327 -289
  30. data/lib/bio/db/medline.rb +25 -4
  31. data/lib/bio/db/nbrf.rb +12 -20
  32. data/lib/bio/db/pdb.rb +4 -1
  33. data/lib/bio/db/pdb/chemicalcomponent.rb +240 -0
  34. data/lib/bio/db/pdb/pdb.rb +13 -8
  35. data/lib/bio/db/rebase.rb +93 -97
  36. data/lib/bio/feature.rb +2 -31
  37. data/lib/bio/io/ddbjxml.rb +167 -139
  38. data/lib/bio/io/fastacmd.rb +89 -56
  39. data/lib/bio/io/flatfile.rb +994 -278
  40. data/lib/bio/io/flatfile/index.rb +257 -194
  41. data/lib/bio/io/flatfile/indexer.rb +37 -29
  42. data/lib/bio/reference.rb +147 -64
  43. data/lib/bio/sequence.rb +57 -417
  44. data/lib/bio/sequence/aa.rb +64 -0
  45. data/lib/bio/sequence/common.rb +175 -0
  46. data/lib/bio/sequence/compat.rb +68 -0
  47. data/lib/bio/sequence/format.rb +134 -0
  48. data/lib/bio/sequence/generic.rb +24 -0
  49. data/lib/bio/sequence/na.rb +189 -0
  50. data/lib/bio/shell.rb +9 -23
  51. data/lib/bio/shell/core.rb +130 -125
  52. data/lib/bio/shell/demo.rb +143 -0
  53. data/lib/bio/shell/{session.rb → interface.rb} +42 -40
  54. data/lib/bio/shell/object.rb +52 -0
  55. data/lib/bio/shell/plugin/codon.rb +4 -22
  56. data/lib/bio/shell/plugin/emboss.rb +23 -0
  57. data/lib/bio/shell/plugin/entry.rb +34 -25
  58. data/lib/bio/shell/plugin/flatfile.rb +5 -23
  59. data/lib/bio/shell/plugin/keggapi.rb +11 -24
  60. data/lib/bio/shell/plugin/midi.rb +5 -23
  61. data/lib/bio/shell/plugin/obda.rb +4 -22
  62. data/lib/bio/shell/plugin/seq.rb +6 -24
  63. data/lib/bio/shell/rails/Rakefile +10 -0
  64. data/lib/bio/shell/rails/app/controllers/application.rb +4 -0
  65. data/lib/bio/shell/rails/app/controllers/shell_controller.rb +94 -0
  66. data/lib/bio/shell/rails/app/helpers/application_helper.rb +3 -0
  67. data/lib/bio/shell/rails/app/models/shell_connection.rb +30 -0
  68. data/lib/bio/shell/rails/app/views/layouts/shell.rhtml +37 -0
  69. data/lib/bio/shell/rails/app/views/shell/history.rhtml +5 -0
  70. data/lib/bio/shell/rails/app/views/shell/index.rhtml +2 -0
  71. data/lib/bio/shell/rails/app/views/shell/show.rhtml +13 -0
  72. data/lib/bio/shell/rails/config/boot.rb +19 -0
  73. data/lib/bio/shell/rails/config/database.yml +85 -0
  74. data/lib/bio/shell/rails/config/environment.rb +53 -0
  75. data/lib/bio/shell/rails/config/environments/development.rb +19 -0
  76. data/lib/bio/shell/rails/config/environments/production.rb +19 -0
  77. data/lib/bio/shell/rails/config/environments/test.rb +19 -0
  78. data/lib/bio/shell/rails/config/routes.rb +19 -0
  79. data/lib/bio/shell/rails/doc/README_FOR_APP +2 -0
  80. data/lib/bio/shell/rails/public/404.html +8 -0
  81. data/lib/bio/shell/rails/public/500.html +8 -0
  82. data/lib/bio/shell/rails/public/dispatch.cgi +10 -0
  83. data/lib/bio/shell/rails/public/dispatch.fcgi +24 -0
  84. data/lib/bio/shell/rails/public/dispatch.rb +10 -0
  85. data/lib/bio/shell/rails/public/favicon.ico +0 -0
  86. data/lib/bio/shell/rails/public/images/icon.png +0 -0
  87. data/lib/bio/shell/rails/public/images/rails.png +0 -0
  88. data/lib/bio/shell/rails/public/index.html +277 -0
  89. data/lib/bio/shell/rails/public/javascripts/controls.js +750 -0
  90. data/lib/bio/shell/rails/public/javascripts/dragdrop.js +584 -0
  91. data/lib/bio/shell/rails/public/javascripts/effects.js +854 -0
  92. data/lib/bio/shell/rails/public/javascripts/prototype.js +1785 -0
  93. data/lib/bio/shell/rails/public/robots.txt +1 -0
  94. data/lib/bio/shell/rails/public/stylesheets/main.css +187 -0
  95. data/lib/bio/shell/rails/script/about +3 -0
  96. data/lib/bio/shell/rails/script/breakpointer +3 -0
  97. data/lib/bio/shell/rails/script/console +3 -0
  98. data/lib/bio/shell/rails/script/destroy +3 -0
  99. data/lib/bio/shell/rails/script/generate +3 -0
  100. data/lib/bio/shell/rails/script/performance/benchmarker +3 -0
  101. data/lib/bio/shell/rails/script/performance/profiler +3 -0
  102. data/lib/bio/shell/rails/script/plugin +3 -0
  103. data/lib/bio/shell/rails/script/process/reaper +3 -0
  104. data/lib/bio/shell/rails/script/process/spawner +3 -0
  105. data/lib/bio/shell/rails/script/process/spinner +3 -0
  106. data/lib/bio/shell/rails/script/runner +3 -0
  107. data/lib/bio/shell/rails/script/server +42 -0
  108. data/lib/bio/shell/rails/test/test_helper.rb +28 -0
  109. data/lib/bio/shell/web.rb +90 -0
  110. data/lib/bio/util/contingency_table.rb +231 -225
  111. data/sample/any2fasta.rb +59 -0
  112. data/test/data/HMMER/hmmpfam.out +64 -0
  113. data/test/data/HMMER/hmmsearch.out +88 -0
  114. data/test/data/aaindex/DAYM780301 +30 -0
  115. data/test/data/aaindex/PRAM900102 +20 -0
  116. data/test/data/bl2seq/cd8a_cd8b_blastp.bl2seq +53 -0
  117. data/test/data/bl2seq/cd8a_p53_e-5blastp.bl2seq +37 -0
  118. data/test/data/blast/{eco:b0002.faa → b0002.faa} +0 -0
  119. data/test/data/blast/{eco:b0002.faa.m0 → b0002.faa.m0} +2 -2
  120. data/test/data/blast/{eco:b0002.faa.m7 → b0002.faa.m7} +1 -1
  121. data/test/data/blast/{eco:b0002.faa.m8 → b0002.faa.m8} +0 -0
  122. data/test/unit/bio/appl/bl2seq/test_report.rb +134 -0
  123. data/test/unit/bio/appl/blast/test_report.rb +15 -12
  124. data/test/unit/bio/appl/blast/test_xmlparser.rb +4 -4
  125. data/test/unit/bio/appl/hmmer/test_report.rb +355 -0
  126. data/test/unit/bio/appl/test_blast.rb +5 -5
  127. data/test/unit/bio/data/test_na.rb +9 -18
  128. data/test/unit/bio/db/pdb/test_pdb.rb +169 -0
  129. data/test/unit/bio/db/test_aaindex.rb +197 -0
  130. data/test/unit/bio/io/test_fastacmd.rb +55 -0
  131. data/test/unit/bio/sequence/test_aa.rb +102 -0
  132. data/test/unit/bio/sequence/test_common.rb +178 -0
  133. data/test/unit/bio/sequence/test_compat.rb +82 -0
  134. data/test/unit/bio/sequence/test_na.rb +242 -0
  135. data/test/unit/bio/shell/plugin/test_seq.rb +29 -19
  136. data/test/unit/bio/test_alignment.rb +15 -7
  137. data/test/unit/bio/test_reference.rb +198 -0
  138. data/test/unit/bio/test_sequence.rb +4 -49
  139. data/test/unit/bio/test_shell.rb +2 -2
  140. metadata +118 -15
  141. data/lib/bio/io/brdb.rb +0 -103
  142. data/lib/bioruby.rb +0 -34
@@ -1,7 +1,34 @@
1
1
  #
2
- # bio/db/embl/uniprot.rb - UniProt database class
2
+ # = bio/db/embl/uniprot.rb - UniProt database class
3
3
  #
4
- # Copyright (C) 2005 KATAYAMA Toshiaki <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2005 KATAYAMA Toshiaki <k@bioruby.org>
5
+ # License:: LGPL
6
+ #
7
+ # $Id: uniprot.rb,v 1.2 2006/01/28 06:40:39 nakao Exp $
8
+ #
9
+ # == Description
10
+ #
11
+ # Name space for UniProtKB/SwissProt specific methods.
12
+ #
13
+ # UniProtKB/SwissProt specific methods are defined in this class.
14
+ # Shared methods for UniProtKB/SwissProt and TrEMBL classes are
15
+ # defined in Bio::SPTR class.
16
+ #
17
+ # == Examples
18
+ #
19
+ # str = File.read("p53_human.swiss")
20
+ # obj = Bio::UniProt.new(str)
21
+ # obj.entry_id #=> "P53_HUMAN"
22
+ #
23
+ # == References
24
+ #
25
+ # * UniProt
26
+ # http://uniprot.org/
27
+ #
28
+ # * The UniProtKB/SwissProt/TrEMBL User Manual
29
+ # http://www.expasy.org/sprot/userman.html
30
+
31
+ #--
5
32
  #
6
33
  # This library is free software; you can redistribute it and/or
7
34
  # modify it under the terms of the GNU Lesser General Public
@@ -17,13 +44,15 @@
17
44
  # License along with this library; if not, write to the Free Software
18
45
  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
46
  #
20
- # $Id: uniprot.rb,v 1.1 2005/09/10 23:43:35 k Exp $
47
+ #++
21
48
  #
22
49
 
23
50
  require 'bio/db/embl/sptr'
24
51
 
25
52
  module Bio
26
53
 
54
+ # Parser class for SwissProt database entry.
55
+ # See also Bio::SPTR class.
27
56
  class UniProt < SPTR
28
57
  # Nothing to do (UniProt format is abstracted in SPTR)
29
58
  end
@@ -1,24 +1,66 @@
1
1
  #
2
- # bio/db/fasta.rb - FASTA format class
2
+ # = bio/db/fasta.rb - FASTA format class
3
3
  #
4
- # Copyright (C) 2001 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
- # Copyright (C) 2001, 2002 KATAYAMA Toshiaki <k@bioruby.org>
4
+ # Copyright:: Copyright (C) 2001, 2002
5
+ # GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>,
6
+ # KATAYAMA Toshiaki <k@bioruby.org>
7
+ # License:: Ruby's
6
8
  #
7
- # This library is free software; you can redistribute it and/or
8
- # modify it under the terms of the GNU Lesser General Public
9
- # License as published by the Free Software Foundation; either
10
- # version 2 of the License, or (at your option) any later version.
9
+ # $Id: fasta.rb,v 1.25 2006/02/22 08:44:46 ngoto Exp $
10
+ #
11
+ # == Description
12
+ #
13
+ # FASTA format class.
11
14
  #
12
- # This library is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
- # Lesser General Public License for more details.
15
+ # == Examples
16
16
  #
17
- # You should have received a copy of the GNU Lesser General Public
18
- # License along with this library; if not, write to the Free Software
19
- # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
+ # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
18
+ # rub.entry_id ==> 'gi|671595'
19
+ # rub.get('emb') ==> 'CAA85678.1'
20
+ # rub.emb ==> 'CAA85678.1'
21
+ # rub.gi ==> '671595'
22
+ # rub.accession ==> 'CAA85678'
23
+ # rub.accessions ==> [ 'CAA85678' ]
24
+ # rub.acc_version ==> 'CAA85678.1'
25
+ # rub.locus ==> nil
26
+ # rub.list_ids ==> [["gi", "671595"],
27
+ # ["emb", "CAA85678.1", nil],
28
+ # ["Perovskia abrotanoides"]]
20
29
  #
21
- # $Id: fasta.rb,v 1.21 2005/09/26 13:00:06 k Exp $
30
+ # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
31
+ # ckr.entry_id ==> "gi|2495000"
32
+ # ckr.sp ==> "CCKR_CAVPO"
33
+ # ckr.pir ==> "I51898"
34
+ # ckr.gb ==> "AAB29504.1"
35
+ # ckr.gi ==> "2495000"
36
+ # ckr.accession ==> "AAB29504"
37
+ # ckr.accessions ==> ["Q63931", "AAB29504"]
38
+ # ckr.acc_version ==> "AAB29504.1"
39
+ # ckr.locus ==> nil
40
+ # ckr.description ==>
41
+ # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
42
+ # ckr.descriptions ==>
43
+ # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
44
+ # "cholecystokinin A receptor - guinea pig",
45
+ # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
46
+ # ckr.words ==>
47
+ # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
48
+ # "receptor", "type"]
49
+ # ckr.id_strings ==>
50
+ # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
51
+ # "544724", "AAB29504.1", "Cavia"]
52
+ # ckr.list_ids ==>
53
+ # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
54
+ # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
55
+ # ["gb", "AAB29504.1", nil], ["Cavia"]]
56
+ #
57
+ # == References
58
+ #
59
+ # * FASTA format (Wikipedia)
60
+ # http://en.wikipedia.org/wiki/FASTA_format
61
+ #
62
+ # * Fasta format description (NCBI)
63
+ # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
22
64
  #
23
65
 
24
66
  require 'bio/db'
@@ -26,34 +68,132 @@ require 'bio/sequence'
26
68
 
27
69
  module Bio
28
70
 
71
+
72
+ # Treats a FASTA formatted entry, such as:
73
+ #
74
+ # >id and/or some comments <== comment line
75
+ # ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
76
+ # ATGCATGCATGCATGCATGCATGCATGCATGCATGC
77
+ # ATGCATGCATGC
78
+ #
79
+ # The preceding '>' can be omitted and the trailing '>' will be removed
80
+ # automatically.
81
+ #
82
+ # === Examples
83
+ #
84
+ # f_str = <<END
85
+ # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
86
+ # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
87
+ # VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
88
+ # GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
89
+ # KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
90
+ # IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
91
+ # QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
92
+ # >sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
93
+ # MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
94
+ # TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
95
+ # GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
96
+ # DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
97
+ # DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
98
+ # EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
99
+ # AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
100
+ # QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
101
+ # CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
102
+ # FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
103
+ # KTGDPLEWRRLFKKISTICRDIILIPN
104
+ # END
105
+ #
106
+ # f = Bio::FastaFormat.new(f_str)
107
+ # puts "### FastaFormat"
108
+ # puts "# entry"
109
+ # puts f.entry
110
+ # puts "# entry_id"
111
+ # p f.entry_id
112
+ # puts "# definition"
113
+ # p f.definition
114
+ # puts "# data"
115
+ # p f.data
116
+ # puts "# seq"
117
+ # p f.seq
118
+ # puts "# seq.type"
119
+ # p f.seq.type
120
+ # puts "# length"
121
+ # p f.length
122
+ # puts "# aaseq"
123
+ # p f.aaseq
124
+ # puts "# aaseq.type"
125
+ # p f.aaseq.type
126
+ # puts "# aaseq.composition"
127
+ # p f.aaseq.composition
128
+ # puts "# aalen"
129
+ # p f.aalen
130
+ #
131
+ # === References
132
+ #
133
+ # * FASTA format (Wikipedia)
134
+ # http://en.wikipedia.org/wiki/FASTA_format
135
+ #
29
136
  class FastaFormat < DB
30
137
 
138
+ # Entry delimiter in flatfile text.
31
139
  DELIMITER = RS = "\n>"
32
140
 
141
+ # (Integer) excess read size included in DELIMITER.
142
+ DELIMITER_OVERRUN = 1 # '>'
143
+
144
+ # The comment line of the FASTA formatted data.
145
+ attr_accessor :definition
146
+
147
+ # The sequence lines in text.
148
+ attr_accessor :data
149
+
150
+ attr_reader :entry_overrun
151
+
152
+ # Stores the comment and sequence information from one entry of the
153
+ # FASTA format string. If the argument contains more than one
154
+ # entry, only the first entry is used.
33
155
  def initialize(str)
34
156
  @definition = str[/.*/].sub(/^>/, '').strip # 1st line
35
157
  @data = str.sub(/.*/, '') # rests
36
158
  @data.sub!(/^>.*/m, '') # remove trailing entries for sure
37
159
  @entry_overrun = $&
38
160
  end
39
- attr_accessor :definition, :data
40
- attr_reader :entry_overrun
41
161
 
162
+ # Returns the stored one entry as a FASTA format. (same as to_s)
42
163
  def entry
43
164
  @entry = ">#{@definition}\n#{@data.strip}\n"
44
165
  end
45
166
  alias to_s entry
46
167
 
168
+
169
+ # Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast
170
+ # factory object.
171
+ #
172
+ # #!/usr/bin/env ruby
173
+ # require 'bio'
174
+ #
175
+ # factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
176
+ # flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
177
+ # flatfile.each do |entry|
178
+ # p entry.definition
179
+ # result = entry.fasta(factory)
180
+ # result.each do |hit|
181
+ # print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
182
+ # p hit.lap_at
183
+ # end
184
+ # end
185
+ #
47
186
  def query(factory)
48
187
  factory.query(@entry)
49
188
  end
50
189
  alias fasta query
51
190
  alias blast query
52
191
 
192
+ # Returns a joined sequence line as a String.
53
193
  def seq
54
194
  unless defined?(@seq)
55
195
  unless /\A\s*^\#/ =~ @data then
56
- @seq = Sequence.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
196
+ @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
57
197
  else
58
198
  a = @data.split(/(^\#.*$)/)
59
199
  i = 0
@@ -69,37 +209,61 @@ module Bio
69
209
  end
70
210
  end
71
211
  @comment = cmnt
72
- @seq = Bio::Sequence.new(s.join(''))
212
+ @seq = Bio::Sequence::Generic.new(s.join(''))
73
213
  end
74
214
  end
75
215
  @seq
76
216
  end
77
217
 
218
+ # Returns comments.
78
219
  def comment
79
220
  seq
80
221
  @comment
81
222
  end
82
223
 
224
+ # Returns sequence length.
83
225
  def length
84
226
  seq.length
85
227
  end
86
228
 
229
+ # Returns the Bio::Sequence::NA.
87
230
  def naseq
88
231
  Sequence::NA.new(seq)
89
232
  end
90
233
 
234
+ # Returns the length of Bio::Sequence::NA.
91
235
  def nalen
92
236
  self.naseq.length
93
237
  end
94
238
 
239
+ # Returns the Bio::Sequence::AA.
95
240
  def aaseq
96
241
  Sequence::AA.new(seq)
97
242
  end
98
243
 
244
+ # Returns the length of Bio::Sequence::AA.
99
245
  def aalen
100
246
  self.aaseq.length
101
247
  end
102
248
 
249
+ # Returns sequence as a Bio::Sequence object.
250
+ #
251
+ # Note: If you modify the returned Bio::Sequence object,
252
+ # the sequence or definition in this FastaFormat object
253
+ # might also be changed (but not always be changed)
254
+ # because of efficiency.
255
+ #
256
+ def to_seq
257
+ seq
258
+ obj = Bio::Sequence.new(@seq)
259
+ obj.definition = self.definition
260
+ obj
261
+ end
262
+
263
+ # Parsing FASTA Defline, and extract IDs.
264
+ # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
265
+ # or ":"-separated IDs.
266
+ # It returns a Bio::FastaDefline instance.
103
267
  def identifiers
104
268
  unless defined?(@ids) then
105
269
  @ids = FastaDefline.new(@definition)
@@ -107,34 +271,69 @@ module Bio
107
271
  @ids
108
272
  end
109
273
 
274
+ # Parsing FASTA Defline (using #identifiers method), and
275
+ # shows a possibly unique identifier.
276
+ # It returns a string.
110
277
  def entry_id
111
278
  identifiers.entry_id
112
279
  end
113
280
 
281
+ # Parsing FASTA Defline (using #identifiers method), and
282
+ # shows GI/locus/accession/accession with version number.
283
+ # If an entry has more than one such ID,
284
+ # only the first ID is shown.
285
+ # It returns a string or nil.
114
286
  def gi
115
287
  identifiers.gi
116
288
  end
117
289
 
290
+ # Returns an accession number.
118
291
  def accession
119
292
  identifiers.accession
120
293
  end
121
294
 
295
+ # Parsing FASTA Defline (using #identifiers method), and
296
+ # shows accession numbers.
297
+ # It returns an array of strings.
122
298
  def accessions
123
299
  identifiers.accessions
124
300
  end
125
301
 
302
+ # Returns accession number with version.
126
303
  def acc_version
127
304
  identifiers.acc_version
128
305
  end
129
306
 
307
+ # Returns locus.
130
308
  def locus
131
309
  identifiers.locus
132
310
  end
133
311
 
134
312
  end #class FastaFormat
135
313
 
314
+ # Treats a FASTA formatted numerical entry, such as:
315
+ #
316
+ # >id and/or some comments <== comment line
317
+ # 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
318
+ # 22 17 15 25 27 32 26 32 29 29 25
319
+ #
320
+ # The preceding '>' can be omitted and the trailing '>' will be removed
321
+ # automatically.
322
+ #
323
+ # --- Bio::FastaNumericFormat.new(entry)
324
+ #
325
+ # Stores the comment and the list of the numerical data.
326
+ #
327
+ # --- Bio::FastaNumericFormat#definition
328
+ #
329
+ # The comment line of the FASTA formatted data.
330
+ #
331
+ # * FASTA format (Wikipedia)
332
+ # http://en.wikipedia.org/wiki/FASTA_format
136
333
  class FastaNumericFormat < FastaFormat
137
334
 
335
+ # Returns the list of the numerical data (typically the quality score
336
+ # of its corresponding sequence) as an Array.
138
337
  def data
139
338
  unless @list
140
339
  @list = @data.strip.split(/\s+/).map {|x| x.to_i}
@@ -142,16 +341,19 @@ module Bio
142
341
  @list
143
342
  end
144
343
 
344
+ # Returns the number of elements in the numerical data.
145
345
  def length
146
346
  data.length
147
347
  end
148
348
 
349
+ # Yields each element of the numerical data.
149
350
  def each
150
351
  data.each do |x|
151
352
  yield x
152
353
  end
153
354
  end
154
355
 
356
+ # Returns the n-th element.
155
357
  def [](n)
156
358
  data[n]
157
359
  end
@@ -160,11 +362,69 @@ module Bio
160
362
 
161
363
  end #class FastaNumericFormat
162
364
 
163
- class FastaDefline
164
365
 
165
- # specs are described in:
166
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
167
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
366
+ # Parses a FASTA defline and extracts IDs and other information.
367
+ # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
368
+ # or ":"-separated IDs.
369
+ #
370
+ # specs are described in:
371
+ # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
372
+ # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
373
+ #
374
+ # === Examples
375
+ #
376
+ # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
377
+ # rub.entry_id ==> 'gi|671595'
378
+ # rub.get('emb') ==> 'CAA85678.1'
379
+ # rub.emb ==> 'CAA85678.1'
380
+ # rub.gi ==> '671595'
381
+ # rub.accession ==> 'CAA85678'
382
+ # rub.accessions ==> [ 'CAA85678' ]
383
+ # rub.acc_version ==> 'CAA85678.1'
384
+ # rub.locus ==> nil
385
+ # rub.list_ids ==> [["gi", "671595"],
386
+ # ["emb", "CAA85678.1", nil],
387
+ # ["Perovskia abrotanoides"]]
388
+ #
389
+ # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
390
+ # ckr.entry_id ==> "gi|2495000"
391
+ # ckr.sp ==> "CCKR_CAVPO"
392
+ # ckr.pir ==> "I51898"
393
+ # ckr.gb ==> "AAB29504.1"
394
+ # ckr.gi ==> "2495000"
395
+ # ckr.accession ==> "AAB29504"
396
+ # ckr.accessions ==> ["Q63931", "AAB29504"]
397
+ # ckr.acc_version ==> "AAB29504.1"
398
+ # ckr.locus ==> nil
399
+ # ckr.description ==>
400
+ # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
401
+ # ckr.descriptions ==>
402
+ # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
403
+ # "cholecystokinin A receptor - guinea pig",
404
+ # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
405
+ # ckr.words ==>
406
+ # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
407
+ # "receptor", "type"]
408
+ # ckr.id_strings ==>
409
+ # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
410
+ # "544724", "AAB29504.1", "Cavia"]
411
+ # ckr.list_ids ==>
412
+ # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
413
+ # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
414
+ # ["gb", "AAB29504.1", nil], ["Cavia"]]
415
+ #
416
+ # === References
417
+ #
418
+ # * Fasta format description (NCBI)
419
+ # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
420
+ #
421
+ # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
422
+ # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
423
+ #
424
+ # * README.formatdb
425
+ # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
426
+ #
427
+ class FastaDefline
168
428
 
169
429
  NSIDs = {
170
430
  # NCBI and WU-BLAST
@@ -197,6 +457,15 @@ module Bio
197
457
  'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
198
458
  }
199
459
 
460
+ # Shows array that contains IDs (or ID-like strings).
461
+ # Returns an array of arrays of strings.
462
+ attr_reader :list_ids
463
+
464
+ # Shows a possibly unique identifier.
465
+ # Returns a string.
466
+ attr_reader :entry_id
467
+
468
+ # Parses given string.
200
469
  def initialize(str)
201
470
  @deflines = []
202
471
  @info = {}
@@ -210,9 +479,7 @@ module Bio
210
479
  end
211
480
  end #def initialize
212
481
 
213
- attr_reader :list_ids
214
- attr_reader :entry_id
215
-
482
+ # Parses given string and adds parsed data.
216
483
  def add_defline(str)
217
484
  case str
218
485
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
@@ -343,6 +610,10 @@ module Bio
343
610
  end #def parse_NSIDs
344
611
  private :parse_NSIDs
345
612
 
613
+
614
+ # Shows original string.
615
+ # Note that the result of this method may be different from
616
+ # original string which is given in FastaDefline.new method.
346
617
  def to_s
347
618
  @deflines.collect { |a|
348
619
  s = a[0]
@@ -350,16 +621,20 @@ module Bio
350
621
  }.join("\x01")
351
622
  end
352
623
 
624
+ # Shows description.
353
625
  def description
354
626
  @deflines[0].to_a[-1]
355
627
  end
356
628
 
629
+ # Returns descriptions.
357
630
  def descriptions
358
631
  @deflines.collect do |a|
359
632
  a[-1]
360
633
  end
361
634
  end
362
635
 
636
+ # Shows ID-like strings.
637
+ # Returns an array of strings.
363
638
  def id_strings
364
639
  r = []
365
640
  @list_ids.each do |a|
@@ -401,6 +676,7 @@ module Bio
401
676
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
402
677
  ]
403
678
 
679
+ # Shows words used in the defline. Returns an Array.
404
680
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
405
681
  kwhash = self.class::KillWordsHash)
406
682
  a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
@@ -426,8 +702,9 @@ module Bio
426
702
  a
427
703
  end
428
704
 
429
- def get(db)
430
- db =db.to_s
705
+ # Returns identifiers by a database name.
706
+ def get(dbname)
707
+ db = dbname.to_s
431
708
  r = nil
432
709
  unless r = @info[db] then
433
710
  di = @list_ids.find { |x| x[0] == db.to_s }
@@ -449,10 +726,11 @@ module Bio
449
726
  r
450
727
  end
451
728
 
452
- def get_by_type(tstr)
729
+ # Returns an identifier by given type.
730
+ def get_by_type(type_str)
453
731
  @list_ids.each do |x|
454
732
  if labels = self.class::NSIDs[x[0]] then
455
- if i = labels.index(tstr) then
733
+ if i = labels.index(type_str) then
456
734
  return x[i+1]
457
735
  end
458
736
  end
@@ -460,11 +738,12 @@ module Bio
460
738
  nil
461
739
  end
462
740
 
463
- def get_all_by_type(*tstrarg)
741
+ # Returns identifiers by given type.
742
+ def get_all_by_type(*type_strarg)
464
743
  d = []
465
744
  @list_ids.each do |x|
466
745
  if labels = self.class::NSIDs[x[0]] then
467
- tstrarg.each do |y|
746
+ type_strarg.each do |y|
468
747
  if i = labels.index(y) then
469
748
  d << x[i+1] if x[i+1]
470
749
  end
@@ -474,6 +753,10 @@ module Bio
474
753
  d
475
754
  end
476
755
 
756
+ # Shows locus.
757
+ # If the entry has more than one such ID,
758
+ # only the first ID is shown.
759
+ # Returns a string or nil.
477
760
  def locus
478
761
  unless defined?(@locus)
479
762
  @locus = get_by_type('locus')
@@ -481,6 +764,10 @@ module Bio
481
764
  @locus
482
765
  end
483
766
 
767
+ # Shows GI.
768
+ # If the entry has more than one such ID,
769
+ # only the first ID is shown.
770
+ # Returns a string or nil.
484
771
  def gi
485
772
  unless defined?(@gi) then
486
773
  @gi = get_by_type('gi')
@@ -488,6 +775,10 @@ module Bio
488
775
  @gi
489
776
  end
490
777
 
778
+ # Shows accession with version number.
779
+ # If the entry has more than one such ID,
780
+ # only the first ID is shown.
781
+ # Returns a string or nil.
491
782
  def acc_version
492
783
  unless defined?(@acc_version) then
493
784
  @acc_version = get_by_type('acc_version')
@@ -495,6 +786,8 @@ module Bio
495
786
  @acc_version
496
787
  end
497
788
 
789
+ # Shows accession numbers.
790
+ # Returns an array of strings.
498
791
  def accessions
499
792
  unless defined?(@accessions) then
500
793
  @accessions = get_all_by_type('accession', 'acc_version')
@@ -503,6 +796,7 @@ module Bio
503
796
  @accessions
504
797
  end
505
798
 
799
+ # Shows an accession number.
506
800
  def accession
507
801
  unless defined?(@accession) then
508
802
  if acc_version then
@@ -523,6 +817,7 @@ module Bio
523
817
  end
524
818
  r
525
819
  end
820
+
526
821
 
527
822
  end #class FastaDefline
528
823
 
@@ -610,260 +905,3 @@ END
610
905
 
611
906
  end
612
907
 
613
- =begin
614
-
615
- = Bio::FastaFormat
616
-
617
- Treats a FASTA formatted entry, such as:
618
-
619
- >id and/or some comments <== comment line
620
- ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
621
- ATGCATGCATGCATGCATGCATGCATGCATGCATGC
622
- ATGCATGCATGC
623
-
624
- The precedent '>' can be omitted and the trailing '>' will be removed
625
- automatically.
626
-
627
- --- Bio::FastaFormat.new(entry)
628
-
629
- Stores the comment and sequence information from one entry of the
630
- FASTA format string. If the argument contains more than one
631
- entry, only the first entry is used.
632
-
633
- --- Bio::FastaFormat#entry
634
-
635
- Returns the stored one entry as a FASTA format. (same as to_s)
636
-
637
- --- Bio::FastaFormat#definition
638
-
639
- Returns the comment line of the FASTA formatted data.
640
-
641
- --- Bio::FastaFormat#seq
642
-
643
- Returns a joined sequence line as a String.
644
-
645
- --- Bio::FastaFormat#query(factory)
646
- --- Bio::FastaFormat#fasta(factory)
647
- --- Bio::FastaFormat#blast(factory)
648
-
649
- Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast
650
- factory object.
651
-
652
- #!/usr/bin/env ruby
653
-
654
- require 'bio'
655
-
656
- factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
657
- flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
658
- flatfile.each do |entry|
659
- p entry.definition
660
- result = entry.fasta(factory)
661
- result.each do |hit|
662
- print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
663
- p hit.lap_at
664
- end
665
- end
666
-
667
- --- Bio::FastaFormat#length
668
-
669
- Returns sequence length.
670
-
671
- --- Bio::FastaFormat#naseq
672
- --- Bio::FastaFormat#nalen
673
- --- Bio::FastaFormat#aaseq
674
- --- Bio::FastaFormat#aalen
675
-
676
- If you know whether the sequence is NA or AA, use these methods.
677
- 'naseq' and 'aaseq' methods returen the Bio::Sequence::NA or
678
- Bio::Sequence::AA object respectively. 'nalen' and 'aalen' methods
679
- return the length of them.
680
-
681
- --- Bio::FastaFormat#identifiers
682
-
683
- Parsing FASTA Defline, and extract IDs.
684
- IDs are NSIDs (NCBI standard FASTA sequence identifiers)
685
- or ":"-separated IDs.
686
- It returns a Bio::FastaDefline instance.
687
-
688
- --- Bio::FastaFormat#entry_id
689
-
690
- Parsing FASTA Defline (using #identifiers method), and
691
- shows a possibly unique identifier.
692
- It returns a string.
693
-
694
- --- Bio::FastaFormat#gi
695
- --- Bio::FastaFormat#locus
696
- --- Bio::FastaFormat#accession
697
- --- Bio::FastaFormat#acc_version
698
-
699
- Parsing FASTA Defline (using #identifiers method), and
700
- shows GI/locus/accession/accession with version number.
701
- If a entry has more than two of such IDs,
702
- only the first ID are shown.
703
- It returns a string or nil.
704
-
705
- --- Bio::FastaFormat#accessions
706
-
707
- Parsing FASTA Defline (using #identifiers method), and
708
- shows accession numbers.
709
- It returns an array of strings.
710
-
711
- --- Bio::FastaFormat
712
-
713
- = Bio::FastaNumericFormat
714
-
715
- Treats a FASTA formatted numerical entry, such as:
716
-
717
- >id and/or some comments <== comment line
718
- 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
719
- 22 17 15 25 27 32 26 32 29 29 25
720
-
721
- The precedent '>' can be omitted and the trailing '>' will be removed
722
- automatically.
723
-
724
- --- Bio::FastaNumericFormat.new(entry)
725
-
726
- Stores the comment and the list of the numerical data.
727
-
728
- --- Bio::FastaNumericFormat#definition
729
-
730
- The comment line of the FASTA formatted data.
731
-
732
- --- Bio::FastaNumericFormat#data
733
-
734
- Returns the list of the numerical data (typically the quality score
735
- of its corresponding sequence) as an Array.
736
-
737
- --- Bio::FastaNumericFormat#length
738
-
739
- Returns the number of elements in the numerical data.
740
-
741
- --- Bio::FastaNumericFormat#each
742
-
743
- Yields on each elements of the numerical data.
744
-
745
- --- Bio::FastaNumericFormat#[](n)
746
-
747
- Returns the n-th element.
748
-
749
- --- Bio::FastaNumericFormat#identifiers
750
- --- Bio::FastaNumericFormat#entry_id
751
- --- Bio::FastaNumericFormat#gi
752
- --- Bio::FastaNumericFormat#locus
753
- --- Bio::FastaNumericFormat#accession
754
- --- Bio::FastaNumericFormat#acc_version
755
- --- Bio::FastaNumericFormat#accessions
756
-
757
- Same as Bio::FastaFormat.
758
-
759
-
760
- = Bio::FastaDefline
761
-
762
- Parsing FASTA Defline, and extract IDs and other informations.
763
- IDs are NSIDs (NCBI standard FASTA sequence identifiers)
764
- or ":"-separated IDs.
765
-
766
- --- see also:
767
- ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
768
- http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
769
-
770
- --- Bio::FastaDefline.new(str)
771
-
772
- Parses given string.
773
-
774
- --- Bio::FastaFormat#entry_id
775
-
776
- Shows a possibly unique identifier.
777
- Returns a string.
778
-
779
- --- Bio::FastaDefline#gi
780
- --- Bio::FastaDefline#locus
781
- --- Bio::FastaDefline#accession
782
- --- Bio::FastaDefline#acc_version
783
-
784
- Shows GI/locus/accession/accession with version number.
785
- If the entry has more than two of such IDs,
786
- only the first ID are shown.
787
- Returns a string or nil.
788
-
789
- --- Bio::FastaFormat#accessions
790
-
791
- Shows accession numbers.
792
- Returns an array of strings.
793
-
794
- --- Bio::FastaDefline#add_defline(str)
795
-
796
- Parses given string and adds parsed data.
797
-
798
- --- Bio::FastaDefline#to_s
799
-
800
- Shows original string.
801
- Note that the result of this method may be different from
802
- original string which is given in FastaDefline.new method.
803
-
804
- --- Bio::FastaDefline#id_strings
805
-
806
- Shows ID-like strings.
807
- Returns an array of strings.
808
-
809
- --- Bio::FastaDefline#list_ids
810
-
811
- Shows array that contains IDs (or ID-like strings).
812
- Returns an array of arrays of strings.
813
-
814
- --- Bio::FastaDefline#description
815
- --- Bio::FastaDefline#descriptions
816
-
817
- --- Bio::FastaDefline#words(case_sensitive = nil,
818
- kill_words_regexp_array, kill_words_hash)
819
-
820
- --- Bio::FastaDefline#get(tag_of_id)
821
-
822
- --- Bio::FastaDefline#get_by_type(type_of_id)
823
-
824
- --- Bio::FastaDefline#get_all_by_type(type_of_id)
825
-
826
- --- examples:
827
- rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
828
- rub.entry_id ==> 'gi|671595'
829
- rub.get('emb') ==> 'CAA85678.1'
830
- rub.emb ==> 'CAA85678.1'
831
- rub.gi ==> '671595'
832
- rub.accession ==> 'CAA85678'
833
- rub.accessions ==> [ 'CAA85678' ]
834
- rub.acc_version ==> 'CAA85678.1'
835
- rub.locus ==> nil
836
- rub.list_ids ==> [["gi", "671595"],
837
- ["emb", "CAA85678.1", nil],
838
- ["Perovskia abrotanoides"]]
839
-
840
- ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
841
- ckr.entry_id ==> "gi|2495000"
842
- ckr.sp ==> "CCKR_CAVPO"
843
- ckr.pir ==> "I51898"
844
- ckr.gb ==> "AAB29504.1"
845
- ckr.gi ==> "2495000"
846
- ckr.accession ==> "AAB29504"
847
- ckr.accessions ==> ["Q63931", "AAB29504"]
848
- ckr.acc_version ==> "AAB29504.1"
849
- ckr.locus ==> nil
850
- ckr.description ==>
851
- "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
852
- ckr.descriptions ==>
853
- ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
854
- "cholecystokinin A receptor - guinea pig",
855
- "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
856
- ckr.words ==>
857
- ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
858
- "receptor", "type"]
859
- ckr.id_strings ==>
860
- ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
861
- "544724", "AAB29504.1", "Cavia"]
862
- ckr.list_ids ==>
863
- [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
864
- ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
865
- ["gb", "AAB29504.1", nil], ["Cavia"]]
866
-
867
- =end
868
-
869
-