bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,869 @@
1
+ #
2
+ # bio/db/fasta.rb - FASTA format class
3
+ #
4
+ # Copyright (C) 2001 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
5
+ # Copyright (C) 2001, 2002 KATAYAMA Toshiaki <k@bioruby.org>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: fasta.rb,v 1.21 2005/09/26 13:00:06 k Exp $
22
+ #
23
+
24
+ require 'bio/db'
25
+ require 'bio/sequence'
26
+
27
+ module Bio
28
+
29
# Holds a single FASTA formatted entry: a definition line and the data
# (normally sequence) lines that follow it.
class FastaFormat < DB

  # Entry separator used when reading a stream of FASTA entries.
  DELIMITER = RS = "\n>"

  # Splits one FASTA entry into its definition line and its data.
  #
  # str:: text of the entry; the leading '>' is optional.  If the text
  #       contains a further line starting with '>', everything from that
  #       line on is cut off and kept in #entry_overrun.
  def initialize(str)
    @definition = str[/.*/].sub(/^>/, '').strip   # 1st line
    @data = str.sub(/.*/, '')                     # the rest
    @data.sub!(/^>.*/m, '')                       # remove trailing entries for sure
    @entry_overrun = $&                           # the removed text, or nil
  end
  attr_accessor :definition, :data
  attr_reader :entry_overrun

  # Returns the entry re-serialized as FASTA text (also available as to_s).
  def entry
    @entry = ">#{@definition}\n#{@data.strip}\n"
  end
  alias to_s entry

  # Passes this entry, as FASTA text, to factory.query and returns the
  # result.  Also available as #fasta and #blast.
  #
  # The text is built with #entry on every call: @entry is only assigned
  # as a side effect of #entry, so reading the instance variable directly
  # would hand nil to the factory when #entry had not been called before.
  def query(factory)
    factory.query(entry)
  end
  alias fasta query
  alias blast query

  # Returns the data lines joined into one sequence object; whitespace and
  # digits are removed.  The result is computed once and cached.
  #
  # When the data starts with a line beginning with '#', all such lines
  # are treated as comments: they are left out of the sequence and stored
  # for #comment, keyed by the number of residues that precede them.
  def seq
    unless defined?(@seq)
      if /\A\s*^\#/ =~ @data then
        a = @data.split(/(^\#.*$)/)
        i = 0
        cmnt = {}
        s = []
        a.each do |x|
          if /^# ?(.*)$/ =~ x then
            # consecutive comment lines at one position are joined by "\n"
            cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1
          else
            x.tr!(" \t\r\n0-9", '')   # lazy clean up
            i += x.length
            s << x
          end
        end
        @comment = cmnt
        @seq = Bio::Sequence.new(s.join(''))
      else
        @seq = Sequence.new(@data.tr(" \t\r\n0-9", ''))   # lazy clean up
      end
    end
    @seq
  end

  # Returns the comments found by #seq as a Hash (position => text), or
  # nil when the data did not start with a '#' line.
  def comment
    seq
    @comment
  end

  # Length of the cleaned-up sequence.
  def length
    seq.length
  end

  # The sequence wrapped as Sequence::NA.
  def naseq
    Sequence::NA.new(seq)
  end

  # Length of #naseq.
  def nalen
    self.naseq.length
  end

  # The sequence wrapped as Sequence::AA.
  def aaseq
    Sequence::AA.new(seq)
  end

  # Length of #aaseq.
  def aalen
    self.aaseq.length
  end

  # Returns a FastaDefline built from the definition line (cached).
  def identifiers
    unless defined?(@ids) then
      @ids = FastaDefline.new(@definition)
    end
    @ids
  end

  # The following readers simply delegate to #identifiers.

  def entry_id
    identifiers.entry_id
  end

  def gi
    identifiers.gi
  end

  def accession
    identifiers.accession
  end

  def accessions
    identifiers.accessions
  end

  def acc_version
    identifiers.acc_version
  end

  def locus
    identifiers.locus
  end

end #class FastaFormat
135
+
136
# A FASTA-like entry whose body is a whitespace separated list of
# integers instead of a sequence.
class FastaNumericFormat < FastaFormat

  # Returns the body parsed into an Array of Integers.  The list is
  # built on first use and reused afterwards.
  def data
    @list ||= @data.strip.split(/\s+/).collect { |token| token.to_i }
    @list
  end

  # Number of values in the list.
  def length
    data.length
  end

  # Yields every value of the list in order.
  def each
    data.each { |value| yield value }
  end

  # Returns the n-th value of the list.
  def [](n)
    data[n]
  end

  # Sequence related methods make no sense for numeric data.
  undef query, blast, fasta, seq, naseq, nalen, aaseq, aalen

end #class FastaNumericFormat
162
+
163
# Parses a FASTA definition line and extracts the identifiers and the
# description it contains.  Several deflines joined by "\x01" (ctrl-A)
# are accepted; each one is parsed with #add_defline.
class FastaDefline

  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

  # Database tag => names of the '|' separated fields that follow the tag.
  NSIDs = {
    # NCBI and WU-BLAST
    'gi'  => [ 'gi' ],                      # NCBI GI
    'gb'  => [ 'acc_version', 'locus' ],    # GenBank
    'emb' => [ 'acc_version', 'locus' ],    # EMBL
    'dbj' => [ 'acc_version', 'locus' ],    # DDBJ
    'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
    'pdb' => [ 'entry_id', 'chain' ],       # PDB
    'bbs' => [ 'number' ],                  # GenInfo Backbone Id
    'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
    'ref' => [ 'acc_version' , 'locus' ],   # NCBI Reference Sequence
    'lcl' => [ 'entry_id' ],                # Local Sequence identifier

    # WU-BLAST and NCBI
    'pir' => [ 'accession', 'entry_id' ],   # PIR
    'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
    'pat' => [ 'country', 'number', 'serial' ], # Patents

    # WU-BLAST only
    'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
    'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
    'gp'  => [ 'acc_version', 'locus' ],    # GenPept
    'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
    'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
    'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
    'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

    # Original
    'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
  }

  # Parses str, which may hold several deflines separated by "\x01".
  def initialize(str)
    @deflines = []    # one parsed record per defline
    @info = {}        # cache of looked-up values ('ec', 'organism', ...)
    @list_ids = []    # every identifier found, as [ tag, value, ... ]

    @entry_id = nil

    lines = str.split("\x01")
    lines.each do |line|
      add_defline(line)
    end
  end #def initialize

  attr_reader :list_ids
  attr_reader :entry_id

  # Parses one defline and adds its identifiers and description to the
  # data already collected.  The identifier of the first defline added
  # becomes #entry_id.
  def add_defline(str)
    case str
    when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
      # NSIDs
      # examples:
      # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
      #
      # note: regexp (?:) means grouping without backreferences
      i = $1
      d = $2
      tks = i.split('|')
      tks << '' if i[-1,1] == '|'   # split drops a trailing empty field
      a = parse_NSIDs(tks)
      i = a[0].join('|')
      a.unshift('|')
      # tokens parse_NSIDs did not consume belong to the description
      d = tks.join('|') + ' ' + d unless tks.empty?
      a << d
      this_line = a
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /\A[A-Z]/ then
          di = [ x ]
          @list_ids << di
          @info['organism'] = x unless @info['organism']
        end
      end

    when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
      # examples:
      # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
      # >emb:CACDC28 [X80034] C.albicans CDC28 gene
      i = $1
      d = $2
      a = parse_ColonSepID(i)
      i = a.join(':')
      this_line = [ ':', a , d ]
      match_EC(d)
      parse_square_brackets(d).each do |x|
        if !match_EC(x, false) and x =~ /:/ then
          parse_ColonSepID(x)
        elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
          @list_ids << [ $1 ]
        end
      end

    when /^\>?\s*(\S+)(?:\s+(.+))?$/
      # examples:
      # >ABC12345 this is test
      i = $1
      d = $2.to_s
      @list_ids << [ i.chomp('.') ]
      this_line = [ '', [ i ], d ]
      match_EC(d)
    else
      i = str
      d = ''
      match_EC(i)
      this_line = [ '', [ i ], d ]
    end

    @deflines << this_line
    @entry_id = i unless @entry_id
  end

  # Looks for "EC:n.n.n.n" style EC numbers ('-' is allowed in place of a
  # number) and returns [ 'EC', number ] for the last one found, or nil.
  # Unless write_flag is false, every hit is also added to the ID list and
  # remembered as the 'ec' info; a number without '-' replaces an earlier
  # one that contains '-'.
  def match_EC(str, write_flag = true)
    di = nil
    # (?: ) groups without capturing, so the only capture is the number
    str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |x|
      di = [ 'EC', x[0] ]
      if write_flag then
        @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
        @list_ids << di
      end
    end
    di
  end
  private :match_EC

  # Returns the contents of every [...] pair found in str.
  def parse_square_brackets(str)
    r = []
    str.scan(/\[([^\]]*)\]/) do |x|
      r << x[0]
    end
    r
  end
  private :parse_square_brackets

  # Splits "db:id" at the first colon, records the pair in the ID list
  # and returns it.  The second element is nil when there is no colon.
  def parse_ColonSepID(str)
    di = str.split(':', 2)
    di << nil if di.size <= 1
    @list_ids << di
    di
  end
  private :parse_ColonSepID

  # Consumes '|' separated tokens from ary according to the NSIDs table
  # and returns the identifiers found, each as [ tag, field, ... ] with
  # nil for empty fields.  Parsing stops at the first token that is not
  # a known tag; a non-empty one is kept as an uncontrolled identifier.
  def parse_NSIDs(ary)
    # this method destroys ary
    data = []
    while token = ary.shift
      if labels = self.class::NSIDs[token] then
        di = [ token ]
        idtype = token
        labels.each do |x|
          token = ary.shift
          break unless token
          if self.class::NSIDs[token] then
            # the next identifier starts early; leave it for the outer loop
            ary.unshift(token)
            break #each
          end
          if token.length > 0 then
            di << token
          else
            di << nil
          end
        end
        data << di
      else
        if token.length > 0 then
          # UCID (uncontrolled identifiers)
          di = [ token ]
          data << di
          @info['ucid'] = token unless @info['ucid']
        end
        break #while
      end
    end #while
    @list_ids.concat data
    data
  end #def parse_NSIDs
  private :parse_NSIDs

  # Rebuilds the defline text from the parsed data; deflines are joined
  # by "\x01".  The result may differ from the string given to new.
  def to_s
    @deflines.collect { |a|
      s = a[0]
      (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
    }.join("\x01")
  end

  # Description part of the first defline.
  def description
    @deflines[0].to_a[-1]
  end

  # Description parts of all deflines.
  def descriptions
    @deflines.collect do |a|
      a[-1]
    end
  end

  # Returns the values of all identifiers found, followed by words of the
  # descriptions that look like identifiers.
  def id_strings
    r = []
    @list_ids.each do |a|
      if a.size >= 2 then
        r.concat a[1..-1].find_all { |x| x }
      else
        if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
    end
    r.concat( words(true, []).find_all do |x|
      x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
        x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
    end)
    r
  end

  # Words that #words drops by default (compared in lower case).
  KillWords = [
    'an', 'the', 'this', 'that',
    'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
    'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
    'from', 'and', 'or', 'not',
    'dna', 'rna', 'mrna', 'cdna', 'orf',
    'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
    'similar', 'involved', 'identical', 'identity',
    'cds', 'clone', 'library', 'contig', 'contigs',
    'homolog', 'homologue', 'homologs', 'homologous',
    'protein', 'proteins', 'gene', 'genes',
    'product', 'products', 'sequence', 'sequences',
    'strain', 'strains', 'region', 'regions',
  ]
  KillWordsHash = {}
  KillWords.each { |x| KillWordsHash[x] = true }

  # Patterns of words that #words drops by default: short numbers and
  # percentages, and the two ID-like patterns used by #id_strings.
  KillRegexpArray = [
    /\A\d{1,3}\%?\z/,
    /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
    /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  ]

  # Returns the sorted, unique words of all descriptions.
  #
  # case_sensitive:: when false or nil, the words are converted to lower case
  # kill_regexp::    words matching any of these patterns are dropped
  # kwhash::         words whose lower-case form is a key here are dropped
  #
  # Words of a single character are always dropped.
  def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
            kwhash = self.class::KillWordsHash)
    a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
    a.collect! do |x|
      x.sub!(/\A[\$\*\-\+]+/, '')
      x.sub!(/[\$\*\-\=]+\z/, '')
      if x.size <= 1 then
        nil
      elsif kwhash[x.downcase] then
        nil
      else
        if kill_regexp.find { |expr| expr =~ x } then
          nil
        else
          x
        end
      end
    end
    a.compact!
    a.collect! { |x| x.downcase } unless case_sensitive
    a.sort!
    a.uniq!
    a
  end

  # Returns the value stored for the tag db (String or Symbol), or nil.
  #
  # For an identifier with several fields the first non-nil field among
  # acc_version, entry_id, locus, accession and number is preferred, then
  # any non-nil field.  A value that was found is cached.
  def get(db)
    db = db.to_s
    r = @info[db]
    unless r then
      di = @list_ids.find { |x| x[0] == db }
      if di and di.size <= 2 then
        r = di[-1]
      elsif di then
        labels = self.class::NSIDs[db] || []
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number'].each do |x|
          if i = labels.index(x) then
            r = di[i+1]
            break if r
          end
        end
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
    end
    r
  end

  # Returns the field named tstr (e.g. 'locus') of the first identifier
  # whose tag defines such a field; nil if that field is empty or if no
  # identifier defines it.
  def get_by_type(tstr)
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        if i = labels.index(tstr) then
          return x[i+1]
        end
      end
    end
    nil
  end

  # Returns all non-nil fields having one of the given names, taken from
  # every identifier.
  def get_all_by_type(*tstrarg)
    d = []
    @list_ids.each do |x|
      if labels = self.class::NSIDs[x[0]] then
        tstrarg.each do |y|
          if i = labels.index(y) then
            d << x[i+1] if x[i+1]
          end
        end
      end
    end
    d
  end

  # Locus of the first identifier that has a locus field (cached).
  def locus
    unless defined?(@locus)
      @locus = get_by_type('locus')
    end
    @locus
  end

  # First NCBI GI number (cached).
  def gi
    unless defined?(@gi) then
      @gi = get_by_type('gi')
    end
    @gi
  end

  # First accession with version number (cached).
  def acc_version
    unless defined?(@acc_version) then
      @acc_version = get_by_type('acc_version')
    end
    @acc_version
  end

  # All accession numbers with any version suffix removed (cached).
  def accessions
    unless defined?(@accessions) then
      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.collect! { |x| x.sub(/\..*\z/, '') }
    end
    @accessions
  end

  # Accession taken from #acc_version if there is one, otherwise the
  # first of #accessions (cached).
  def accession
    unless defined?(@accession) then
      if acc_version then
        @accession = acc_version.split('.')[0]
      else
        @accession = accessions[0]
      end
    end
    @accession
  end

  # Makes every tag usable as a reader, e.g. defline.emb is the same as
  # defline.get('emb').  A tag listed in NSIDs that is absent from the
  # defline gives nil; any other unknown name is handed to super, which
  # raises NoMethodError.
  def method_missing(name, *args)
    # raise ArgumentError,
    # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
    r = get(name, *args)
    if !r and !(self.class::NSIDs[name.to_s]) then
      super
    end
    r
  end

  # Keeps respond_to? in line with #method_missing.
  def respond_to_missing?(name, include_private = false)
    return true if self.class::NSIDs.key?(name.to_s)
    return true unless get(name).nil?
    super
  end

end #class FastaDefline
528
+
529
+ end #module Bio
530
+
531
# Self test: runs only when this file is executed directly.  Parses one
# sequence entry and one numeric entry and prints what the readers return.
if __FILE__ == $0

  f_str = <<END
>sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
>sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
KTGDPLEWRRLFKKISTICRDIILIPN
END

  # only the first of the two entries above is used by FastaFormat.new
  f = Bio::FastaFormat.new(f_str)
  puts "### FastaFormat"
  puts "# entry"
  puts f.entry
  puts "# entry_id"
  p f.entry_id
  puts "# definition"
  p f.definition
  puts "# data"
  p f.data
  puts "# seq"
  p f.seq
  puts "# seq.type"
  p f.seq.class       # Object#type no longer exists in Ruby 1.9 and later
  puts "# length"
  p f.length
  puts "# aaseq"
  p f.aaseq
  puts "# aaseq.type"
  p f.aaseq.class     # Object#type no longer exists in Ruby 1.9 and later
  puts "# aaseq.composition"
  p f.aaseq.composition
  puts "# aalen"
  p f.aalen

  puts

  n_str = <<END
>CRA3575282.F
24 15 23 29 20 13 20 21 21 23 22 25 13 22 17 15 25 27 32 26
32 29 29 25
END

  n = Bio::FastaNumericFormat.new(n_str)
  puts "### FastaNumericFormat"
  puts "# entry"
  puts n.entry
  puts "# entry_id"
  p n.entry_id
  puts "# definition"
  p n.definition
  puts "# data"
  p n.data
  puts "# length"
  p n.length
  puts "# percent to ratio by yield"
  n.each do |x|
    p x/100.0
  end
  puts "# first three"
  p n[0]
  p n[1]
  p n[2]
  puts "# last one"
  p n[-1]

end
612
+
613
+ =begin
614
+
615
+ = Bio::FastaFormat
616
+
617
+ Treats a FASTA formatted entry, such as:
618
+
619
+ >id and/or some comments <== comment line
620
+ ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
621
+ ATGCATGCATGCATGCATGCATGCATGCATGCATGC
622
+ ATGCATGCATGC
623
+
624
+ The leading '>' can be omitted and the trailing '>' will be removed
625
+ automatically.
626
+
627
+ --- Bio::FastaFormat.new(entry)
628
+
629
+ Stores the comment and sequence information from one entry of the
630
+ FASTA format string. If the argument contains more than one
631
+ entry, only the first entry is used.
632
+
633
+ --- Bio::FastaFormat#entry
634
+
635
+ Returns the stored one entry as a FASTA format. (same as to_s)
636
+
637
+ --- Bio::FastaFormat#definition
638
+
639
+ Returns the comment line of the FASTA formatted data.
640
+
641
+ --- Bio::FastaFormat#seq
642
+
643
+ Returns a joined sequence line as a String.
644
+
645
+ --- Bio::FastaFormat#query(factory)
646
+ --- Bio::FastaFormat#fasta(factory)
647
+ --- Bio::FastaFormat#blast(factory)
648
+
649
+ Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast
650
+ factory object.
651
+
652
+ #!/usr/bin/env ruby
653
+
654
+ require 'bio'
655
+
656
+ factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
657
+ flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
658
+ flatfile.each do |entry|
659
+ p entry.definition
660
+ result = entry.fasta(factory)
661
+ result.each do |hit|
662
+ print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
663
+ p hit.lap_at
664
+ end
665
+ end
666
+
667
+ --- Bio::FastaFormat#length
668
+
669
+ Returns sequence length.
670
+
671
+ --- Bio::FastaFormat#naseq
672
+ --- Bio::FastaFormat#nalen
673
+ --- Bio::FastaFormat#aaseq
674
+ --- Bio::FastaFormat#aalen
675
+
676
+ If you know whether the sequence is NA or AA, use these methods.
677
+ 'naseq' and 'aaseq' methods return the Bio::Sequence::NA or
678
+ Bio::Sequence::AA object respectively. 'nalen' and 'aalen' methods
679
+ return the length of them.
680
+
681
+ --- Bio::FastaFormat#identifiers
682
+
683
+ Parsing FASTA Defline, and extract IDs.
684
+ IDs are NSIDs (NCBI standard FASTA sequence identifiers)
685
+ or ":"-separated IDs.
686
+ It returns a Bio::FastaDefline instance.
687
+
688
+ --- Bio::FastaFormat#entry_id
689
+
690
+ Parsing FASTA Defline (using #identifiers method), and
691
+ shows a possibly unique identifier.
692
+ It returns a string.
693
+
694
+ --- Bio::FastaFormat#gi
695
+ --- Bio::FastaFormat#locus
696
+ --- Bio::FastaFormat#accession
697
+ --- Bio::FastaFormat#acc_version
698
+
699
+ Parsing FASTA Defline (using #identifiers method), and
700
+ shows GI/locus/accession/accession with version number.
701
+ If an entry has two or more such IDs,
702
+ only the first ID is shown.
703
+ It returns a string or nil.
704
+
705
+ --- Bio::FastaFormat#accessions
706
+
707
+ Parsing FASTA Defline (using #identifiers method), and
708
+ shows accession numbers.
709
+ It returns an array of strings.
710
+
711
+ --- Bio::FastaFormat
712
+
713
+ = Bio::FastaNumericFormat
714
+
715
+ Treats a FASTA formatted numerical entry, such as:
716
+
717
+ >id and/or some comments <== comment line
718
+ 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
719
+ 22 17 15 25 27 32 26 32 29 29 25
720
+
721
+ The leading '>' can be omitted and the trailing '>' will be removed
722
+ automatically.
723
+
724
+ --- Bio::FastaNumericFormat.new(entry)
725
+
726
+ Stores the comment and the list of the numerical data.
727
+
728
+ --- Bio::FastaNumericFormat#definition
729
+
730
+ The comment line of the FASTA formatted data.
731
+
732
+ --- Bio::FastaNumericFormat#data
733
+
734
+ Returns the list of the numerical data (typically the quality score
735
+ of its corresponding sequence) as an Array.
736
+
737
+ --- Bio::FastaNumericFormat#length
738
+
739
+ Returns the number of elements in the numerical data.
740
+
741
+ --- Bio::FastaNumericFormat#each
742
+
743
+ Yields each element of the numerical data.
744
+
745
+ --- Bio::FastaNumericFormat#[](n)
746
+
747
+ Returns the n-th element.
748
+
749
+ --- Bio::FastaNumericFormat#identifiers
750
+ --- Bio::FastaNumericFormat#entry_id
751
+ --- Bio::FastaNumericFormat#gi
752
+ --- Bio::FastaNumericFormat#locus
753
+ --- Bio::FastaNumericFormat#accession
754
+ --- Bio::FastaNumericFormat#acc_version
755
+ --- Bio::FastaNumericFormat#accessions
756
+
757
+ Same as Bio::FastaFormat.
758
+
759
+
760
+ = Bio::FastaDefline
761
+
762
+ Parses a FASTA defline and extracts IDs and other information.
763
+ IDs are NSIDs (NCBI standard FASTA sequence identifiers)
764
+ or ":"-separated IDs.
765
+
766
+ --- see also:
767
+ ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
768
+ http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
769
+
770
+ --- Bio::FastaDefline.new(str)
771
+
772
+ Parses given string.
773
+
774
+ --- Bio::FastaDefline#entry_id
775
+
776
+ Shows a possibly unique identifier.
777
+ Returns a string.
778
+
779
+ --- Bio::FastaDefline#gi
780
+ --- Bio::FastaDefline#locus
781
+ --- Bio::FastaDefline#accession
782
+ --- Bio::FastaDefline#acc_version
783
+
784
+ Shows GI/locus/accession/accession with version number.
785
+ If the entry has two or more such IDs,
786
+ only the first ID is shown.
787
+ Returns a string or nil.
788
+
789
+ --- Bio::FastaDefline#accessions
790
+
791
+ Shows accession numbers.
792
+ Returns an array of strings.
793
+
794
+ --- Bio::FastaDefline#add_defline(str)
795
+
796
+ Parses given string and adds parsed data.
797
+
798
+ --- Bio::FastaDefline#to_s
799
+
800
+ Shows original string.
801
+ Note that the result of this method may be different from
802
+ original string which is given in FastaDefline.new method.
803
+
804
+ --- Bio::FastaDefline#id_strings
805
+
806
+ Shows ID-like strings.
807
+ Returns an array of strings.
808
+
809
+ --- Bio::FastaDefline#list_ids
810
+
811
+ Shows array that contains IDs (or ID-like strings).
812
+ Returns an array of arrays of strings.
813
+
814
+ --- Bio::FastaDefline#description
815
+ --- Bio::FastaDefline#descriptions
816
+
817
+ --- Bio::FastaDefline#words(case_sensitive = nil,
818
+ kill_words_regexp_array, kill_words_hash)
819
+
820
+ --- Bio::FastaDefline#get(tag_of_id)
821
+
822
+ --- Bio::FastaDefline#get_by_type(type_of_id)
823
+
824
+ --- Bio::FastaDefline#get_all_by_type(type_of_id)
825
+
826
+ --- examples:
827
+ rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
828
+ rub.entry_id ==> 'gi|671595'
829
+ rub.get('emb') ==> 'CAA85678.1'
830
+ rub.emb ==> 'CAA85678.1'
831
+ rub.gi ==> '671595'
832
+ rub.accession ==> 'CAA85678'
833
+ rub.accessions ==> [ 'CAA85678' ]
834
+ rub.acc_version ==> 'CAA85678.1'
835
+ rub.locus ==> nil
836
+ rub.list_ids ==> [["gi", "671595"],
837
+ ["emb", "CAA85678.1", nil],
838
+ ["Perovskia abrotanoides"]]
839
+
840
+ ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
841
+ ckr.entry_id ==> "gi|2495000"
842
+ ckr.sp ==> "CCKR_CAVPO"
843
+ ckr.pir ==> "I51898"
844
+ ckr.gb ==> "AAB29504.1"
845
+ ckr.gi ==> "2495000"
846
+ ckr.accession ==> "AAB29504"
847
+ ckr.accessions ==> ["Q63931", "AAB29504"]
848
+ ckr.acc_version ==> "AAB29504.1"
849
+ ckr.locus ==> nil
850
+ ckr.description ==>
851
+ "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
852
+ ckr.descriptions ==>
853
+ ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
854
+ "cholecystokinin A receptor - guinea pig",
855
+ "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
856
+ ckr.words ==>
857
+ ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
858
+ "receptor", "type"]
859
+ ckr.id_strings ==>
860
+ ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
861
+ "544724", "AAB29504.1", "Cavia"]
862
+ ckr.list_ids ==>
863
+ [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
864
+ ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
865
+ ["gb", "AAB29504.1", nil], ["Cavia"]]
866
+
867
+ =end
868
+
869
+