bio 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. data/ChangeLog +1712 -0
  2. data/KNOWN_ISSUES.rdoc +11 -1
  3. data/README.rdoc +3 -2
  4. data/RELEASE_NOTES.rdoc +65 -127
  5. data/bioruby.gemspec +38 -2
  6. data/doc/RELEASE_NOTES-1.4.0.rdoc +167 -0
  7. data/doc/Tutorial.rd +74 -16
  8. data/doc/Tutorial.rd.html +68 -16
  9. data/lib/bio.rb +2 -0
  10. data/lib/bio/appl/clustalw/report.rb +18 -0
  11. data/lib/bio/appl/paml/codeml/report.rb +579 -21
  12. data/lib/bio/command.rb +149 -21
  13. data/lib/bio/db/aaindex.rb +11 -1
  14. data/lib/bio/db/embl/sptr.rb +1 -1
  15. data/lib/bio/db/fasta/defline.rb +7 -2
  16. data/lib/bio/db/fasta/qual.rb +24 -0
  17. data/lib/bio/db/fasta/qual_to_biosequence.rb +29 -0
  18. data/lib/bio/db/fastq.rb +15 -0
  19. data/lib/bio/db/go.rb +2 -2
  20. data/lib/bio/db/kegg/common.rb +109 -5
  21. data/lib/bio/db/kegg/genes.rb +61 -15
  22. data/lib/bio/db/kegg/genome.rb +43 -38
  23. data/lib/bio/db/kegg/module.rb +158 -0
  24. data/lib/bio/db/kegg/orthology.rb +40 -1
  25. data/lib/bio/db/kegg/pathway.rb +254 -0
  26. data/lib/bio/db/medline.rb +6 -2
  27. data/lib/bio/io/flatfile/autodetection.rb +6 -0
  28. data/lib/bio/location.rb +39 -0
  29. data/lib/bio/reference.rb +24 -0
  30. data/lib/bio/sequence.rb +2 -0
  31. data/lib/bio/sequence/adapter.rb +1 -0
  32. data/lib/bio/sequence/format.rb +14 -0
  33. data/lib/bio/sequence/sequence_masker.rb +95 -0
  34. data/lib/bio/tree.rb +4 -4
  35. data/lib/bio/util/restriction_enzyme/double_stranded/aligned_strands.rb +5 -0
  36. data/lib/bio/version.rb +1 -1
  37. data/setup.rb +5 -0
  38. data/test/data/KEGG/K02338.orthology +180 -52
  39. data/test/data/KEGG/M00118.module +44 -0
  40. data/test/data/KEGG/T00005.genome +140 -0
  41. data/test/data/KEGG/T00070.genome +34 -0
  42. data/test/data/KEGG/b0529.gene +47 -0
  43. data/test/data/KEGG/ec00072.pathway +23 -0
  44. data/test/data/KEGG/hsa00790.pathway +59 -0
  45. data/test/data/KEGG/ko00312.pathway +16 -0
  46. data/test/data/KEGG/map00030.pathway +37 -0
  47. data/test/data/KEGG/map00052.pathway +13 -0
  48. data/test/data/KEGG/rn00250.pathway +114 -0
  49. data/test/data/clustalw/example1.aln +58 -0
  50. data/test/data/go/selected_component.ontology +12 -0
  51. data/test/data/go/selected_gene_association.sgd +31 -0
  52. data/test/data/go/selected_wikipedia2go +13 -0
  53. data/test/data/medline/20146148_modified.medline +54 -0
  54. data/test/data/paml/codeml/models/aa.aln +26 -0
  55. data/test/data/paml/codeml/models/aa.dnd +13 -0
  56. data/test/data/paml/codeml/models/aa.ph +13 -0
  57. data/test/data/paml/codeml/models/alignment.phy +49 -0
  58. data/test/data/paml/codeml/models/results0-3.txt +312 -0
  59. data/test/data/paml/codeml/models/results7-8.txt +340 -0
  60. data/test/functional/bio/io/test_togows.rb +8 -8
  61. data/test/functional/bio/test_command.rb +7 -6
  62. data/test/unit/bio/appl/clustalw/test_report.rb +80 -0
  63. data/test/unit/bio/appl/paml/codeml/test_rates.rb +6 -6
  64. data/test/unit/bio/appl/paml/codeml/test_report.rb +231 -24
  65. data/test/unit/bio/appl/paml/codeml/test_report_single.rb +46 -0
  66. data/test/unit/bio/db/embl/test_sptr.rb +1 -1
  67. data/test/unit/bio/db/fasta/test_defline.rb +160 -0
  68. data/test/unit/bio/db/fasta/test_defline_misc.rb +490 -0
  69. data/test/unit/bio/db/kegg/test_genes.rb +281 -1
  70. data/test/unit/bio/db/kegg/test_genome.rb +408 -0
  71. data/test/unit/bio/db/kegg/test_module.rb +246 -0
  72. data/test/unit/bio/db/kegg/test_orthology.rb +95 -0
  73. data/test/unit/bio/db/kegg/test_pathway.rb +1250 -0
  74. data/test/unit/bio/db/test_aaindex.rb +8 -7
  75. data/test/unit/bio/db/test_fastq.rb +36 -0
  76. data/test/unit/bio/db/test_go.rb +171 -0
  77. data/test/unit/bio/db/test_medline.rb +148 -0
  78. data/test/unit/bio/db/test_qual.rb +9 -2
  79. data/test/unit/bio/sequence/test_sequence_masker.rb +169 -0
  80. data/test/unit/bio/test_tree.rb +260 -1
  81. data/test/unit/bio/util/test_contingency_table.rb +7 -7
  82. metadata +53 -6
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # = bio/db/kegg/genes.rb - KEGG/GENES database class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002, 2006
4
+ # Copyright:: Copyright (C) 2001, 2002, 2006, 2010
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # License:: The Ruby License
7
7
  #
@@ -29,8 +29,7 @@
29
29
  #
30
30
  # # NAME
31
31
  # p entry.name # => String
32
- # p entry.genes # => Array
33
- # p entry.gene # => String
32
+ # p entry.names # => Array
34
33
  #
35
34
  # # DEFINITION
36
35
  # p entry.definition # => String
@@ -47,7 +46,7 @@
47
46
  # p entry.locations # => Bio::Locations
48
47
  #
49
48
  # # MOTIF
50
- # p entry.motif # => Hash of Array
49
+ # p entry.motifs # => Hash of Array
51
50
  #
52
51
  # # DBLINKS
53
52
  # p entry.dblinks # => Hash of Array
@@ -169,10 +168,24 @@ class GENES < KEGGDB
169
168
  #
170
169
  # ---
171
170
  # *Returns*:: Array containing String
172
- def genes
171
+ def names_as_array
173
172
  name.split(', ')
174
173
  end
174
+ alias names names_as_array
175
+
176
+ # The method will be deprecated. Use Bio::KEGG::GENES#names.
177
+ #
178
+ # Names of the entry as an Array, described in the NAME line.
179
+ #
180
+ # ---
181
+ # *Returns*:: Array containing String
182
+ def genes
183
+ names_as_array
184
+ end
175
185
 
186
+ # The method will be deprecated.
187
+ # Use <tt>entry.names.first</tt> instead.
188
+ #
176
189
  # Returns the first gene name described in the NAME line.
177
190
  # ---
178
191
  # *Returns*:: String
@@ -191,12 +204,14 @@ class GENES < KEGGDB
191
204
  # ---
192
205
  # *Returns*:: Array containing String
193
206
  def eclinks
194
- ec_list = definition.slice(/\[EC:(.*?)\]/, 1)
195
- if ec_list
196
- ec_list.strip.split(/\s+/)
197
- else
198
- []
207
+ unless defined? @eclinks
208
+ ec_list =
209
+ definition.slice(/\[EC\:([^\]]+)\]/, 1) ||
210
+ definition.slice(/\(EC\:([^\)]+)\)/, 1)
211
+ ary = ec_list ? ec_list.strip.split(/\s+/) : []
212
+ @eclinks = ary
199
213
  end
214
+ @eclinks
200
215
  end
201
216
 
202
217
  # Orthologs described in the ORTHOLOGY lines.
@@ -210,7 +225,10 @@ class GENES < KEGGDB
210
225
  # ---
211
226
  # *Returns*:: String
212
227
  def pathway
213
- field_fetch('PATHWAY')
228
+ unless defined? @pathway
229
+ @pathway = fetch('PATHWAY')
230
+ end
231
+ @pathway
214
232
  end
215
233
 
216
234
  # Pathways described in the PATHWAY lines.
@@ -220,6 +238,16 @@ class GENES < KEGGDB
220
238
  lines_fetch('PATHWAY')
221
239
  end
222
240
 
241
+ # Returns CLASS field of the entry.
242
+ def keggclass
243
+ field_fetch('CLASS')
244
+ end
245
+
246
+ # Returns an Array of biological classes in CLASS field.
247
+ def keggclasses
248
+ keggclass.gsub(/ \[[^\]]+/, '').split(/\] ?/)
249
+ end
250
+
223
251
  # The position in the genome described in the POSITION line.
224
252
  # ---
225
253
  # *Returns*:: String
@@ -259,16 +287,23 @@ class GENES < KEGGDB
259
287
  Bio::Locations.new(gbposition)
260
288
  end
261
289
 
290
+ # Motif information described in the MOTIF lines.
291
+ # ---
292
+ # *Returns*:: Array containing String
293
+ def motifs_as_strings
294
+ lines_fetch('MOTIF')
295
+ end
296
+
262
297
  # Motif information described in the MOTIF lines.
263
298
  # ---
264
299
  # *Returns*:: Hash
265
- def motif
300
+ def motifs_as_hash
266
301
  unless @data['MOTIF']
267
302
  hash = {}
268
303
  db = nil
269
- lines_fetch('MOTIF').each do |line|
304
+ motifs_as_strings.each do |line|
270
305
  if line[/^\S+:/]
271
- db, str = line.split(/:/)
306
+ db, str = line.split(/:/, 2)
272
307
  else
273
308
  str = line
274
309
  end
@@ -279,6 +314,17 @@ class GENES < KEGGDB
279
314
  end
280
315
  @data['MOTIF'] # Hash of Array of IDs in MOTIF
281
316
  end
317
+ alias motifs motifs_as_hash
318
+
319
+ # The specification of the method will be changed in the future.
320
+ # Please use Bio::KEGG::GENES#motifs.
321
+ #
322
+ # Motif information described in the MOTIF lines.
323
+ # ---
324
+ # *Returns*:: Hash
325
+ def motif
326
+ motifs
327
+ end
282
328
 
283
329
  # Links to other databases described in the DBLINKS lines.
284
330
  # ---
@@ -298,7 +344,7 @@ class GENES < KEGGDB
298
344
  end
299
345
  alias structures structure
300
346
 
301
- # Codon usage data described in the CODON_USAGE lines.
347
+ # Codon usage data described in the CODON_USAGE lines. (Deprecated: the field no longer exists)
302
348
  # ---
303
349
  # *Returns*:: Hash
304
350
  def codon_usage(codon = nil)
@@ -8,6 +8,8 @@
8
8
  #
9
9
 
10
10
  require 'bio/db'
11
+ require 'bio/reference'
12
+ require 'bio/db/kegg/common'
11
13
 
12
14
  module Bio
13
15
  class KEGG
@@ -27,10 +29,37 @@ class GENOME < KEGGDB
27
29
  DELIMITER = RS = "\n///\n"
28
30
  TAGSIZE = 12
29
31
 
32
+ include Common::References
33
+ # REFERENCE -- Returns contents of the REFERENCE records as an Array of
34
+ # Bio::Reference objects.
35
+ def references; super; end if false #dummy for RDoc
36
+
37
+
30
38
  def initialize(entry)
31
39
  super(entry, TAGSIZE)
32
40
  end
33
41
 
42
+ # (private) Returns a tag name of the field as a String.
43
+ # Needed to redefine because of the PLASMID field.
44
+ def tag_get(str)
45
+ if /\APLASMID\s+/ =~ str.to_s then
46
+ 'PLASMID'
47
+ else
48
+ super(str)
49
+ end
50
+ end
51
+ private :tag_get
52
+
53
+ # (private) Returns a String of the field without a tag name.
54
+ # Needed to redefine because of the PLASMID field.
55
+ def tag_cut(str)
56
+ if /\APLASMID\s+/ =~ str.to_s then
57
+ $'
58
+ else
59
+ super(str)
60
+ end
61
+ end
62
+ private :tag_cut
34
63
 
35
64
  # ENTRY -- Returns contents of the ENTRY record as a String.
36
65
  def entry_id
@@ -80,7 +109,20 @@ class GENOME < KEGGDB
80
109
 
81
110
  # ORIGINAL_DB -- Returns contents of the ORIGINAL_DB record as a String.
82
111
  def original_db
83
- field_fetch('ORIGINAL_DB')
112
+ #field_fetch('ORIGINAL_DB')
113
+ unless defined?(@original_db)
114
+ @original_db = fetch('ORIGINAL_DB')
115
+ end
116
+ @original_db
117
+ end
118
+
119
+ # Returns ORIGINAL_DB record as an Array containing String objects.
120
+ #
121
+ # ---
122
+ # *Arguments*:
123
+ # *Returns*:: Array containing String objects
124
+ def original_databases
125
+ lines_fetch('ORIGINAL_DB')
84
126
  end
85
127
 
86
128
  # DISEASE -- Returns contents of the DISEASE record as a String.
@@ -93,43 +135,6 @@ class GENOME < KEGGDB
93
135
  field_fetch('COMMENT')
94
136
  end
95
137
 
96
- # REFERENCE -- Returns contents of the REFERENCE records as an Array of
97
- # Bio::Reference objects.
98
- def references
99
- unless @data['REFERENCE']
100
- ary = []
101
- toptag2array(get('REFERENCE')).each do |ref|
102
- hash = Hash.new('')
103
- subtag2array(ref).each do |field|
104
- case tag_get(field)
105
- when /AUTHORS/
106
- authors = truncate(tag_cut(field))
107
- authors = authors.split(', ')
108
- authors[-1] = authors[-1].split(/\s+and\s+/)
109
- authors = authors.flatten.map { |a| a.sub(',', ', ') }
110
- hash['authors'] = authors
111
- when /TITLE/
112
- hash['title'] = truncate(tag_cut(field))
113
- when /JOURNAL/
114
- journal = truncate(tag_cut(field))
115
- if journal =~ /(.*) (\d+):(\d+)-(\d+) \((\d+)\) \[UI:(\d+)\]$/
116
- hash['journal'] = $1
117
- hash['volume'] = $2
118
- hash['pages'] = $3
119
- hash['year'] = $5
120
- hash['medline'] = $6
121
- else
122
- hash['journal'] = journal
123
- end
124
- end
125
- end
126
- ary.push(Reference.new(hash))
127
- end
128
- @data['REFERENCE'] = References.new(ary)
129
- end
130
- @data['REFERENCE']
131
- end
132
-
133
138
  # CHROMOSOME -- Returns contents of the CHROMOSOME records as an Array
134
139
  # of Hash.
135
140
  def chromosomes
@@ -0,0 +1,158 @@
1
+ #
2
+ # = bio/db/kegg/module.rb - KEGG MODULE database class
3
+ #
4
+ # Copyright:: Copyright (C) 2010 Kozo Nishida <kozo-ni@is.naist.jp>
5
+ # Copyright:: Copyright (C) 2010 Toshiaki Katayama <k@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+
11
+ require 'bio/db'
12
+ require 'bio/db/kegg/common'
13
+
14
+ module Bio
15
+ class KEGG
16
+
17
+ # == Description
18
+ #
19
+ # Bio::KEGG::MODULE is a parser class for the KEGG MODULE database entry.
20
+ #
21
+ # == References
22
+ #
23
+ # * http://www.kegg.jp/kegg-bin/get_htext?ko00002.keg
24
+ # * ftp://ftp.genome.jp/pub/kegg/pathway/module
25
+ #
26
+ class MODULE < KEGGDB
27
+
28
+ DELIMITER = RS = "\n///\n"
29
+ TAGSIZE = 12
30
+
31
+ #--
32
+ # for a private method strings_as_hash.
33
+ #++
34
+ include Common::StringsAsHash
35
+
36
+ # Creates a new Bio::KEGG::MODULE object.
37
+ # ---
38
+ # *Arguments*:
39
+ # * (required) _entry_: (String) single entry as a string
40
+ # *Returns*:: Bio::KEGG::MODULE object
41
+ def initialize(entry)
42
+ super(entry, TAGSIZE)
43
+ end
44
+
45
+ # Return the ID, described in the ENTRY line.
46
+ # ---
47
+ # *Returns*:: String
48
+ def entry_id
49
+ field_fetch('ENTRY')[/\S+/]
50
+ end
51
+
52
+ # Name of the module, described in the NAME line.
53
+ # ---
54
+ # *Returns*:: String
55
+ def name
56
+ field_fetch('NAME')
57
+ end
58
+
59
+ # Definition of the module, described in the DEFINITION line.
60
+ # ---
61
+ # *Returns*:: String
62
+ def definition
63
+ field_fetch('DEFINITION')
64
+ end
65
+
66
+ # Name of the KEGG class, described in the CLASS line.
67
+ # ---
68
+ # *Returns*:: String
69
+ def keggclass
70
+ field_fetch('CLASS')
71
+ end
72
+
73
+ # Pathways described in the PATHWAY lines.
74
+ # ---
75
+ # *Returns*:: Array containing String
76
+ def pathways_as_strings
77
+ lines_fetch('PATHWAY')
78
+ end
79
+
80
+ # Pathways described in the PATHWAY lines.
81
+ # ---
82
+ # *Returns*:: Hash of pathway ID and its definition
83
+ def pathways_as_hash
84
+ unless @pathways_as_hash
85
+ @pathways_as_hash = strings_as_hash(pathways_as_strings)
86
+ end
87
+ @pathways_as_hash
88
+ end
89
+ alias pathways pathways_as_hash
90
+
91
+
92
+ # Orthologs described in the ORTHOLOGY lines.
93
+ # ---
94
+ # *Returns*:: Array containing String
95
+ def orthologs_as_strings
96
+ lines_fetch('ORTHOLOGY')
97
+ end
98
+
99
+ # Orthologs described in the ORTHOLOGY lines.
100
+ # ---
101
+ # *Returns*:: Hash of orthology ID and its definition
102
+ def orthologs_as_hash
103
+ unless @orthologs_as_hash
104
+ @orthologs_as_hash = strings_as_hash(orthologs_as_strings)
105
+ end
106
+ @orthologs_as_hash
107
+ end
108
+ alias orthologs orthologs_as_hash
109
+
110
+ # All KO IDs in the ORTHOLOGY lines.
111
+ # ---
112
+ # *Returns*:: Array of orthology IDs
113
+ def orthologs_as_array
114
+ orthologs_as_hash.keys.map{|x| x.split(/\+|\-|,/)}.flatten.sort.uniq
115
+ end
116
+
117
+
118
+ # Reactions described in the REACTION lines.
119
+ # ---
120
+ # *Returns*:: Array containing String
121
+ def reactions_as_strings
122
+ lines_fetch('REACTION')
123
+ end
124
+
125
+ # Reactions described in the REACTION lines.
126
+ # ---
127
+ # *Returns*:: Hash of reaction ID and its definition
128
+ def reactions_as_hash
129
+ unless @reactions_as_hash
130
+ @reactions_as_hash = strings_as_hash(reactions_as_strings)
131
+ end
132
+ @reactions_as_hash
133
+ end
134
+ alias reactions reactions_as_hash
135
+
136
+
137
+ # Compounds described in the COMPOUND lines.
138
+ # ---
139
+ # *Returns*:: Array containing String
140
+ def compounds_as_strings
141
+ lines_fetch('COMPOUND')
142
+ end
143
+
144
+ # Compounds described in the COMPOUND lines.
145
+ # ---
146
+ # *Returns*:: Hash of compound ID and its definition
147
+ def compounds_as_hash
148
+ unless @compounds_as_hash
149
+ @compounds_as_hash = strings_as_hash(compounds_as_strings)
150
+ end
151
+ @compounds_as_hash
152
+ end
153
+ alias compounds compounds_as_hash
154
+
155
+ end # MODULE
156
+
157
+ end # KEGG
158
+ end # Bio
@@ -38,6 +38,27 @@ class ORTHOLOGY < KEGGDB
38
38
  def genes_as_hash; super; end if false #dummy for RDoc
39
39
  alias genes genes_as_hash
40
40
 
41
+ include Common::PathwaysAsHash
42
+ # Returns a Hash of the pathway ID and name in PATHWAY field.
43
+ def pathways_as_hash; super; end if false #dummy for RDoc
44
+ alias pathways pathways_as_hash
45
+
46
+ include Common::ModulesAsHash
47
+ # Returns MODULE field as a Hash.
48
+ # Each key of the hash is KEGG MODULE ID,
49
+ # and each value is the name of the Pathway Module.
50
+ # ---
51
+ # *Returns*:: Hash
52
+ def modules_as_hash; super; end if false #dummy for RDoc
53
+ alias modules modules_as_hash
54
+
55
+ include Common::References
56
+ # REFERENCE -- Returns contents of the REFERENCE records as an Array of
57
+ # Bio::Reference objects.
58
+ # ---
59
+ # *Returns*:: an Array containing Bio::Reference objects
60
+ def references; super; end if false #dummy for RDoc
61
+
41
62
  # Reads a flat file format entry of the KO database.
42
63
  def initialize(entry)
43
64
  super(entry, TAGSIZE)
@@ -73,10 +94,28 @@ class ORTHOLOGY < KEGGDB
73
94
  keggclass.gsub(/ \[[^\]]+/, '').split(/\] ?/)
74
95
  end
75
96
 
97
+ # Pathways described in the PATHWAY field.
98
+ # ---
99
+ # *Returns*:: Array containing String
100
+ def pathways_as_strings
101
+ lines_fetch('PATHWAY')
102
+ end
103
+
104
+ # *OBSOLETE* Do not use this method.
105
+ # Because the KEGG ORTHOLOGY format has changed and a PATHWAY field was added,
106
+ # the older "pathways" method has been renamed and remains only for compatibility.
107
+ #
76
108
  # Returns an Array of KEGG/PATHWAY ID in CLASS field.
77
- def pathways
109
+ def pathways_in_keggclass
78
110
  keggclass.scan(/\[PATH:(.*?)\]/).flatten
79
111
  end
112
+
113
+ # Returns MODULE field of the entry.
114
+ # ---
115
+ # *Returns*:: Array containing String objects
116
+ def modules_as_strings
117
+ lines_fetch('MODULE')
118
+ end
80
119
 
81
120
  # Returns an Array of a database name and entry IDs in DBLINKS field.
82
121
  def dblinks_as_strings