bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,869 @@
1
+ #
2
+ # bio/db/fasta.rb - FASTA format class
3
+ #
4
+ # Copyright (C) 2001 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
+ # Copyright (C) 2001, 2002 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: fasta.rb,v 1.21 2005/09/26 13:00:06 k Exp $
22
+ #
23
+
24
+ require 'bio/db'
25
+ require 'bio/sequence'
26
+
27
+ module Bio
28
+
29
# Holds a single FASTA formatted entry: one definition line followed by
# the sequence lines.  The leading '>' of the definition line is optional,
# and anything from a second '>' line onwards is cut off and kept in
# #entry_overrun.
class FastaFormat < DB

  # Entry separator.
  DELIMITER = RS = "\n>"

  # Splits +str+ into the definition (first line, without the leading '>'
  # and surrounding whitespace) and the raw data (everything after the
  # first line).
  def initialize(str)
    @definition = str[/.*/].sub(/^>/, '').strip  # 1st line
    @data = str.sub(/.*/, '')                    # rests
    @data.sub!(/^>.*/m, '')                      # remove trailing entries for sure
    # $& is the text removed by the sub! above, or nil when nothing matched.
    @entry_overrun = $&
  end
  attr_accessor :definition, :data
  attr_reader :entry_overrun

  # Returns the entry as FASTA formatted text (">definition\ndata\n") and
  # remembers it in @entry.
  def entry
    @entry = ">#{@definition}\n#{@data.strip}\n"
  end
  alias to_s entry

  # Passes this entry, as FASTA formatted text, to factory.query and
  # returns whatever the factory returns.
  #
  # The text is built through #entry on every call; the previous code
  # passed @entry, which is nil until #entry / #to_s has been called.
  def query(factory)
    factory.query(entry)
  end
  alias fasta query
  alias blast query

  # Returns the sequence as a Sequence object, built on first use and
  # cached.  Spaces, tabs, line breaks and digits are removed.
  #
  # When the data starts with a '#' line, '#' lines are treated as comments:
  # they are left out of the sequence and stored in a hash that maps the
  # number of residues read so far to the comment text (see #comment).
  def seq
    unless defined?(@seq)
      if /\A\s*^\#/ =~ @data
        parts = @data.split(/(^\#.*$)/)
        i = 0
        cmnt = {}
        s = []
        parts.each do |x|
          if /^# ?(.*)$/ =~ x
            # several comment lines at the same position are joined by "\n"
            cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1
          else
            x.tr!(" \t\r\n0-9", '') # lazy clean up
            i += x.length
            s << x
          end
        end
        @comment = cmnt
        @seq = Bio::Sequence.new(s.join(''))
      else
        @seq = Sequence.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
      end
    end
    @seq
  end

  # Returns the comments collected by #seq (a hash of position => text),
  # or nil when the data has no leading '#' comment line.
  def comment
    seq
    @comment
  end

  # Returns the length of the sequence.
  def length
    seq.length
  end

  # Returns the sequence wrapped in Sequence::NA.
  def naseq
    Sequence::NA.new(seq)
  end

  # Returns the length of #naseq.
  def nalen
    self.naseq.length
  end

  # Returns the sequence wrapped in Sequence::AA.
  def aaseq
    Sequence::AA.new(seq)
  end

  # Returns the length of #aaseq.
  def aalen
    self.aaseq.length
  end

  # Returns a FastaDefline built from the definition line (created on first
  # use and cached).
  def identifiers
    unless defined?(@ids)
      @ids = FastaDefline.new(@definition)
    end
    @ids
  end

  # The methods below delegate to the FastaDefline returned by #identifiers.

  def entry_id
    identifiers.entry_id
  end

  def gi
    identifiers.gi
  end

  def accession
    identifiers.accession
  end

  def accessions
    identifiers.accessions
  end

  def acc_version
    identifiers.acc_version
  end

  def locus
    identifiers.locus
  end

end #class FastaFormat
135
+
136
# A FASTA style entry whose body is a whitespace separated list of
# integers instead of a sequence.  The sequence related methods inherited
# from FastaFormat are removed at the end of the class.
class FastaNumericFormat < FastaFormat

  # Returns the body as an Array of integers.  The raw text is parsed on
  # the first call and the resulting list is reused afterwards.
  def data
    @list ||= @data.strip.split(/\s+/).map { |token| token.to_i }
  end

  # Returns the number of values in #data.
  def length
    data.size
  end

  # Yields every value of #data in order.
  def each
    data.each { |value| yield value }
  end

  # Returns the n-th value of #data.
  def [](n)
    data[n]
  end

  undef query, blast, fasta, seq, naseq, nalen, aaseq, aalen

end #class FastaNumericFormat
162
+
163
# Parses a FASTA definition line and extracts identifiers and description
# text from it.  Three layouts are recognised:
#
# * '|' separated NCBI style identifiers  (>gi|9910844|sp|Q9UWG2|RL3_METVA ...)
# * ':' separated identifiers             (>sce:YBR160W ...)
# * a plain first word                    (>ABC12345 this is test)
#
# One string may hold several deflines separated by "\x01"; each of them
# is parsed with #add_defline.
class FastaDefline

  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

  # Database tag => names of the fields that follow the tag.
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                      # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],      # GenBank
    'emb' => [ 'acc_version', 'locus' ],      # EMBL
    'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],       # PDB
    'bbs' => [ 'number' ],                  # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                # Local Sequence identifier

    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],   # PIR
    'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents

    # WU-BLAST only
    'bbm' => [ 'number' ],                  # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],                  # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],      # GenPept
    'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # Parses +str+, which may contain several deflines separated by "\x01".
  def initialize(str)
    @deflines = []
    @info = {}
    @list_ids = []

    @entry_id = nil

    str.split("\x01").each do |line|
      add_defline(line)
    end
  end #def initialize

  # Array of identifier arrays found so far, e.g.
  # [["gi", "671595"], ["emb", "CAA85678.1", nil], ["Perovskia abrotanoides"]].
  attr_reader :list_ids

  # Identifier of the first defline that was parsed.
  attr_reader :entry_id

  # Parses one defline and adds its identifiers and description to the
  # data collected so far.  The first defline added also sets #entry_id.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      i = $1
      d = $2
      tks = i.split('|')
      # String#split drops the trailing empty field; put it back so that
      # the last identifier gets an explicit nil for it.
      tks << '' if i[-1,1] == '|'
      a = parse_NSIDs(tks)
      i = a[0].join('|')
      a.unshift('|')
      # tokens parse_NSIDs did not consume belong to the description
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) && x =~ /\A[A-Z]/
          @list_ids << [ x ]
          @info['organism'] = x unless @info['organism']
        end
      end

    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      # >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a , d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) && x =~ /:/
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
          @list_ids << [ $1 ]
        end
      end

    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      # >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end

    @deflines << this_line
    @entry_id = i unless @entry_id
  end

  # Scans +str+ for "EC:n.n.n.n" (each n made of digits and '-') and
  # returns [ 'EC', number ] for the last one found, or nil.
  # With +write_flag+ every match is also added to the ID list, and the
  # 'ec' info entry is set when it is empty or still contains a '-'.
  def match_EC(str, write_flag = true)
    di = nil
    # (?:...) groups without capturing, so $1 is the whole EC number
    str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do
      di = [ 'EC', $1 ]
      if write_flag
        @info['ec'] = di[1] if (!@info['ec'] || @info['ec'].to_s =~ /\-/)
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns the contents of every [...] pair in +str+.
  def parse_square_brackets(str)
    r = []
    str.scan(/\[([^\]]*)\]/) do |x|
      r << x[0]
    end
    r
  end
  private :parse_square_brackets

  # Splits "db:id" at the first ':' (a missing id becomes nil), adds the
  # pair to the ID list and returns it.
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Turns the '|' separated tokens in +ary+ into identifier arrays such as
  # ["sp", "Q9UWG2", "RL3_METVA"], adds them to the ID list and returns them.
  # Parsing stops at the first token that is not a known database tag;
  # a non-empty one is kept as an uncontrolled identifier (UCID).
  def parse_NSIDs(ary)
    # this method destroys ary
    data = []
    while (token = ary.shift)
      if (labels = self.class::NSIDs[token])
        di = [ token ]
        labels.each do
          field = ary.shift
          break unless field
          if self.class::NSIDs[field]
            # next database tag reached: leave it for the outer loop
            ary.unshift(field)
            break #each
          end
          if field.length > 0
            di << field
          else
            di << nil
          end
        end
        data << di
      else
        if token.length > 0
          # UCID (uncontrolled identifiers)
          di = [ token ]
          data << di
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
    end #while
    @list_ids.concat data
    data
  end #def parse_NSIDs
  private :parse_NSIDs

  # Rebuilds the defline text from the parsed data.  The result may differ
  # from the string that was originally given.
  def to_s
    @deflines.collect { |a|
      s = a[0]
      (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Returns the description of the first defline (nil when there is none).
  def description
    @deflines[0].to_a[-1]
  end

  # Returns the descriptions of all deflines.
  def descriptions
    @deflines.collect do |a|
      a[-1]
    end
  end

  # Returns ID-like strings: the field values of every parsed identifier,
  # single-element entries made only of letters, digits, '.', '-' and '_',
  # and description words that look like identifiers.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2
        r.concat a[1..-1].find_all { |x| x }
      else
        if a[0].to_s.size > 0 && a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
    end
    r.concat( words(true, []).find_all do |x|
                x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
                  x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
              end)
    r
  end

  # Words that #words leaves out by default (compared in lower case).
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Patterns of words that #words leaves out by default: short numbers
  # (optionally followed by '%') and identifier-like words.
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Returns the sorted, unique words of all descriptions.
  # Words of one character, words listed in +kwhash+ and words matching
  # any expression in +kill_regexp+ are dropped.  The result is lower-cased
  # unless +case_sensitive+ is true.
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1
        nil
      elsif kwhash[x.downcase]
        nil
      elsif kill_regexp.find { |expr| expr =~ x }
        nil
      else
        x
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns the identifier stored for the database tag +db+
  # (e.g. 'gi', 'emb', 'ec'), or nil.  For tags with several fields the
  # first available of acc_version, entry_id, locus, accession and number
  # is preferred.  A value that was found is cached.
  def get(db)
    db = db.to_s
    r = @info[db]
    unless r
      di = @list_ids.find { |x| x[0] == db }
      if di && di.size <= 2
        r = di[-1]
      elsif di
        labels = self.class::NSIDs[db]
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number'].each do |x|
          if (i = labels.index(x))
            r = di[i+1]
            break if r
          end
        end
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
    end
    r
  end

  # Returns the field named +tstr+ (e.g. 'locus') of the first identifier
  # whose database tag defines such a field; the value may be nil.
  def get_by_type(tstr)
    @list_ids.each do |x|
      if (labels = self.class::NSIDs[x[0]])
        if (i = labels.index(tstr))
          return x[i+1]
        end
      end
    end
    nil
  end

  # Returns all non-nil values of the fields named in +tstrarg+.
  def get_all_by_type(*tstrarg)
    d = []
    @list_ids.each do |x|
      if (labels = self.class::NSIDs[x[0]])
        tstrarg.each do |y|
          if (i = labels.index(y))
            d << x[i+1] if x[i+1]
          end
        end
      end
    end
    d
  end

  # Returns the first locus field, or nil.
  def locus
    unless defined?(@locus)
      @locus = get_by_type('locus')
    end
    @locus
  end

  # Returns the first GI number, or nil.
  def gi
    unless defined?(@gi)
      @gi = get_by_type('gi')
    end
    @gi
  end

  # Returns the first accession with version number, or nil.
  def acc_version
    unless defined?(@acc_version)
      @acc_version = get_by_type('acc_version')
    end
    @acc_version
  end

  # Returns all accession numbers with the version suffix removed.
  def accessions
    unless defined?(@accessions)
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # Returns the accession without version: taken from #acc_version when
  # there is one, otherwise the first element of #accessions.
  def accession
    unless defined?(@accession)
      if acc_version
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Lets database tags be used as methods (rub.emb is rub.get('emb')).
  # Raises for a name that has no value and is not a key of NSIDs.
  def method_missing(name, *args)
    # raise ArgumentError,
    # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r && !(self.class::NSIDs[name.to_s])
      raise "NameError: undefined method `#{name.inspect}'"
    end
    r
  end

  # Keeps respond_to? in step with #method_missing: true for every key of
  # NSIDs and for every name #get has a value for.  Without it, implicit
  # conversion probes such as to_ary (Array#flatten, puts) reach
  # #method_missing and fail with its RuntimeError.
  def respond_to_missing?(name, include_private = false)
    key = name.to_s
    return true if self.class::NSIDs.key?(key) || get(key)
    super
  end

end #class FastaDefline
528
+
529
+ end #module Bio
530
+
531
# Self test: run this file directly to parse two sample entries and print
# the values returned by the main accessors.
if __FILE__ == $0

  # Two amino acid entries; FastaFormat.new uses only the first one.
  f_str = <<END
>sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
>sce:YBR274W  CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
KTGDPLEWRRLFKKISTICRDIILIPN
END

  f = Bio::FastaFormat.new(f_str)
  puts "### FastaFormat"
  puts "# entry"
  puts f.entry
  puts "# entry_id"
  p f.entry_id
  puts "# definition"
  p f.definition
  puts "# data"
  p f.data
  puts "# seq"
  p f.seq
  # Object#type no longer exists in current Ruby; Object#class is used instead.
  puts "# seq.class"
  p f.seq.class
  puts "# length"
  p f.length
  puts "# aaseq"
  p f.aaseq
  puts "# aaseq.class"
  p f.aaseq.class
  puts "# aaseq.composition"
  p f.aaseq.composition
  puts "# aalen"
  p f.aalen

  puts

  # One numeric entry (a list of integer values).
  n_str = <<END
>CRA3575282.F
24 15 23 29 20 13 20 21 21 23 22 25 13 22 17 15 25 27 32 26
32 29 29 25
END

  n = Bio::FastaNumericFormat.new(n_str)
  puts "### FastaNumericFormat"
  puts "# entry"
  puts n.entry
  puts "# entry_id"
  p n.entry_id
  puts "# definition"
  p n.definition
  puts "# data"
  p n.data
  puts "# length"
  p n.length
  puts "# percent to ratio by yield"
  n.each do |x|
    p x/100.0
  end
  puts "# first three"
  p n[0]
  p n[1]
  p n[2]
  puts "# last one"
  p n[-1]

end
612
+
613
+ =begin
614
+
615
+ = Bio::FastaFormat
616
+
617
+ Treats a FASTA formatted entry, such as:
618
+
619
+ >id and/or some comments <== comment line
620
+ ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
621
+ ATGCATGCATGCATGCATGCATGCATGCATGCATGC
622
+ ATGCATGCATGC
623
+
624
+ The precedent '>' can be omitted and the trailing '>' will be removed
625
+ automatically.
626
+
627
+ --- Bio::FastaFormat.new(entry)
628
+
629
+ Stores the comment and sequence information from one entry of the
630
+ FASTA format string. If the argument contains more than one
631
+ entry, only the first entry is used.
632
+
633
+ --- Bio::FastaFormat#entry
634
+
635
+ Returns the stored one entry as a FASTA format. (same as to_s)
636
+
637
+ --- Bio::FastaFormat#definition
638
+
639
+ Returns the comment line of the FASTA formatted data.
640
+
641
+ --- Bio::FastaFormat#seq
642
+
643
+ Returns a joined sequence line as a String.
644
+
645
+ --- Bio::FastaFormat#query(factory)
646
+ --- Bio::FastaFormat#fasta(factory)
647
+ --- Bio::FastaFormat#blast(factory)
648
+
649
+ Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast
650
+ factory object.
651
+
652
+ #!/usr/bin/env ruby
653
+
654
+ require 'bio'
655
+
656
+ factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
657
+ flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
658
+ flatfile.each do |entry|
659
+ p entry.definition
660
+ result = entry.fasta(factory)
661
+ result.each do |hit|
662
+ print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
663
+ p hit.lap_at
664
+ end
665
+ end
666
+
667
+ --- Bio::FastaFormat#length
668
+
669
+ Returns sequence length.
670
+
671
+ --- Bio::FastaFormat#naseq
672
+ --- Bio::FastaFormat#nalen
673
+ --- Bio::FastaFormat#aaseq
674
+ --- Bio::FastaFormat#aalen
675
+
676
+ If you know whether the sequence is NA or AA, use these methods.
677
+ 'naseq' and 'aaseq' methods return the Bio::Sequence::NA or
678
+ Bio::Sequence::AA object respectively. 'nalen' and 'aalen' methods
679
+ return the length of them.
680
+
681
+ --- Bio::FastaFormat#identifiers
682
+
683
+ Parsing FASTA Defline, and extract IDs.
684
+ IDs are NSIDs (NCBI standard FASTA sequence identifiers)
685
+ or ":"-separated IDs.
686
+ It returns a Bio::FastaDefline instance.
687
+
688
+ --- Bio::FastaFormat#entry_id
689
+
690
+ Parsing FASTA Defline (using #identifiers method), and
691
+ shows a possibly unique identifier.
692
+ It returns a string.
693
+
694
+ --- Bio::FastaFormat#gi
695
+ --- Bio::FastaFormat#locus
696
+ --- Bio::FastaFormat#accession
697
+ --- Bio::FastaFormat#acc_version
698
+
699
+ Parsing FASTA Defline (using #identifiers method), and
700
+ shows GI/locus/accession/accession with version number.
701
+ If an entry has more than one such ID,
702
+ only the first ID is shown.
703
+ It returns a string or nil.
704
+
705
+ --- Bio::FastaFormat#accessions
706
+
707
+ Parsing FASTA Defline (using #identifiers method), and
708
+ shows accession numbers.
709
+ It returns an array of strings.
710
+
711
+ --- Bio::FastaFormat
712
+
713
+ = Bio::FastaNumericFormat
714
+
715
+ Treats a FASTA formatted numerical entry, such as:
716
+
717
+ >id and/or some comments <== comment line
718
+ 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
719
+ 22 17 15 25 27 32 26 32 29 29 25
720
+
721
+ The precedent '>' can be omitted and the trailing '>' will be removed
722
+ automatically.
723
+
724
+ --- Bio::FastaNumericFormat.new(entry)
725
+
726
+ Stores the comment and the list of the numerical data.
727
+
728
+ --- Bio::FastaNumericFormat#definition
729
+
730
+ The comment line of the FASTA formatted data.
731
+
732
+ --- Bio::FastaNumericFormat#data
733
+
734
+ Returns the list of the numerical data (typically the quality score
735
+ of its corresponding sequence) as an Array.
736
+
737
+ --- Bio::FastaNumericFormat#length
738
+
739
+ Returns the number of elements in the numerical data.
740
+
741
+ --- Bio::FastaNumericFormat#each
742
+
743
+ Yields on each elements of the numerical data.
744
+
745
+ --- Bio::FastaNumericFormat#[](n)
746
+
747
+ Returns the n-th element.
748
+
749
+ --- Bio::FastaNumericFormat#identifiers
750
+ --- Bio::FastaNumericFormat#entry_id
751
+ --- Bio::FastaNumericFormat#gi
752
+ --- Bio::FastaNumericFormat#locus
753
+ --- Bio::FastaNumericFormat#accession
754
+ --- Bio::FastaNumericFormat#acc_version
755
+ --- Bio::FastaNumericFormat#accessions
756
+
757
+ Same as Bio::FastaFormat.
758
+
759
+
760
+ = Bio::FastaDefline
761
+
762
+ Parses a FASTA defline and extracts IDs and other information.
763
+ IDs are NSIDs (NCBI standard FASTA sequence identifiers)
764
+ or ":"-separated IDs.
765
+
766
+ --- see also:
767
+ ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
768
+ http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
769
+
770
+ --- Bio::FastaDefline.new(str)
771
+
772
+ Parses given string.
773
+
774
+ --- Bio::FastaDefline#entry_id
775
+
776
+ Shows a possibly unique identifier.
777
+ Returns a string.
778
+
779
+ --- Bio::FastaDefline#gi
780
+ --- Bio::FastaDefline#locus
781
+ --- Bio::FastaDefline#accession
782
+ --- Bio::FastaDefline#acc_version
783
+
784
+ Shows GI/locus/accession/accession with version number.
785
+ If the entry has more than one such ID,
786
+ only the first ID is shown.
787
+ Returns a string or nil.
788
+
789
+ --- Bio::FastaDefline#accessions
790
+
791
+ Shows accession numbers.
792
+ Returns an array of strings.
793
+
794
+ --- Bio::FastaDefline#add_defline(str)
795
+
796
+ Parses given string and adds parsed data.
797
+
798
+ --- Bio::FastaDefline#to_s
799
+
800
+ Shows original string.
801
+ Note that the result of this method may be different from
802
+ original string which is given in FastaDefline.new method.
803
+
804
+ --- Bio::FastaDefline#id_strings
805
+
806
+ Shows ID-like strings.
807
+ Returns an array of strings.
808
+
809
+ --- Bio::FastaDefline#list_ids
810
+
811
+ Shows array that contains IDs (or ID-like strings).
812
+ Returns an array of arrays of strings.
813
+
814
+ --- Bio::FastaDefline#description
815
+ --- Bio::FastaDefline#descriptions
816
+
817
+ --- Bio::FastaDefline#words(case_sensitive = nil,
818
+ kill_words_regexp_array, kill_words_hash)
819
+
820
+ --- Bio::FastaDefline#get(tag_of_id)
821
+
822
+ --- Bio::FastaDefline#get_by_type(type_of_id)
823
+
824
+ --- Bio::FastaDefline#get_all_by_type(type_of_id)
825
+
826
+ --- examples:
827
+ rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
828
+ rub.entry_id ==> 'gi|671595'
829
+ rub.get('emb') ==> 'CAA85678.1'
830
+ rub.emb ==> 'CAA85678.1'
831
+ rub.gi ==> '671595'
832
+ rub.accession ==> 'CAA85678'
833
+ rub.accessions ==> [ 'CAA85678' ]
834
+ rub.acc_version ==> 'CAA85678.1'
835
+ rub.locus ==> nil
836
+ rub.list_ids ==> [["gi", "671595"],
837
+ ["emb", "CAA85678.1", nil],
838
+ ["Perovskia abrotanoides"]]
839
+
840
+ ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
841
+ ckr.entry_id ==> "gi|2495000"
842
+ ckr.sp ==> "CCKR_CAVPO"
843
+ ckr.pir ==> "I51898"
844
+ ckr.gb ==> "AAB29504.1"
845
+ ckr.gi ==> "2495000"
846
+ ckr.accession ==> "AAB29504"
847
+ ckr.accessions ==> ["Q63931", "AAB29504"]
848
+ ckr.acc_version ==> "AAB29504.1"
849
+ ckr.locus ==> nil
850
+ ckr.description ==>
851
+ "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
852
+ ckr.descriptions ==>
853
+ ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
854
+ "cholecystokinin A receptor - guinea pig",
855
+ "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
856
+ ckr.words ==>
857
+ ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
858
+ "receptor", "type"]
859
+ ckr.id_strings ==>
860
+ ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
861
+ "544724", "AAB29504.1", "Cavia"]
862
+ ckr.list_ids ==>
863
+ [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
864
+ ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
865
+ ["gb", "AAB29504.1", nil], ["Cavia"]]
866
+
867
+ =end
868
+
869
+