bio 1.4.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. data/ChangeLog +1712 -0
  2. data/KNOWN_ISSUES.rdoc +11 -1
  3. data/README.rdoc +3 -2
  4. data/RELEASE_NOTES.rdoc +65 -127
  5. data/bioruby.gemspec +38 -2
  6. data/doc/RELEASE_NOTES-1.4.0.rdoc +167 -0
  7. data/doc/Tutorial.rd +74 -16
  8. data/doc/Tutorial.rd.html +68 -16
  9. data/lib/bio.rb +2 -0
  10. data/lib/bio/appl/clustalw/report.rb +18 -0
  11. data/lib/bio/appl/paml/codeml/report.rb +579 -21
  12. data/lib/bio/command.rb +149 -21
  13. data/lib/bio/db/aaindex.rb +11 -1
  14. data/lib/bio/db/embl/sptr.rb +1 -1
  15. data/lib/bio/db/fasta/defline.rb +7 -2
  16. data/lib/bio/db/fasta/qual.rb +24 -0
  17. data/lib/bio/db/fasta/qual_to_biosequence.rb +29 -0
  18. data/lib/bio/db/fastq.rb +15 -0
  19. data/lib/bio/db/go.rb +2 -2
  20. data/lib/bio/db/kegg/common.rb +109 -5
  21. data/lib/bio/db/kegg/genes.rb +61 -15
  22. data/lib/bio/db/kegg/genome.rb +43 -38
  23. data/lib/bio/db/kegg/module.rb +158 -0
  24. data/lib/bio/db/kegg/orthology.rb +40 -1
  25. data/lib/bio/db/kegg/pathway.rb +254 -0
  26. data/lib/bio/db/medline.rb +6 -2
  27. data/lib/bio/io/flatfile/autodetection.rb +6 -0
  28. data/lib/bio/location.rb +39 -0
  29. data/lib/bio/reference.rb +24 -0
  30. data/lib/bio/sequence.rb +2 -0
  31. data/lib/bio/sequence/adapter.rb +1 -0
  32. data/lib/bio/sequence/format.rb +14 -0
  33. data/lib/bio/sequence/sequence_masker.rb +95 -0
  34. data/lib/bio/tree.rb +4 -4
  35. data/lib/bio/util/restriction_enzyme/double_stranded/aligned_strands.rb +5 -0
  36. data/lib/bio/version.rb +1 -1
  37. data/setup.rb +5 -0
  38. data/test/data/KEGG/K02338.orthology +180 -52
  39. data/test/data/KEGG/M00118.module +44 -0
  40. data/test/data/KEGG/T00005.genome +140 -0
  41. data/test/data/KEGG/T00070.genome +34 -0
  42. data/test/data/KEGG/b0529.gene +47 -0
  43. data/test/data/KEGG/ec00072.pathway +23 -0
  44. data/test/data/KEGG/hsa00790.pathway +59 -0
  45. data/test/data/KEGG/ko00312.pathway +16 -0
  46. data/test/data/KEGG/map00030.pathway +37 -0
  47. data/test/data/KEGG/map00052.pathway +13 -0
  48. data/test/data/KEGG/rn00250.pathway +114 -0
  49. data/test/data/clustalw/example1.aln +58 -0
  50. data/test/data/go/selected_component.ontology +12 -0
  51. data/test/data/go/selected_gene_association.sgd +31 -0
  52. data/test/data/go/selected_wikipedia2go +13 -0
  53. data/test/data/medline/20146148_modified.medline +54 -0
  54. data/test/data/paml/codeml/models/aa.aln +26 -0
  55. data/test/data/paml/codeml/models/aa.dnd +13 -0
  56. data/test/data/paml/codeml/models/aa.ph +13 -0
  57. data/test/data/paml/codeml/models/alignment.phy +49 -0
  58. data/test/data/paml/codeml/models/results0-3.txt +312 -0
  59. data/test/data/paml/codeml/models/results7-8.txt +340 -0
  60. data/test/functional/bio/io/test_togows.rb +8 -8
  61. data/test/functional/bio/test_command.rb +7 -6
  62. data/test/unit/bio/appl/clustalw/test_report.rb +80 -0
  63. data/test/unit/bio/appl/paml/codeml/test_rates.rb +6 -6
  64. data/test/unit/bio/appl/paml/codeml/test_report.rb +231 -24
  65. data/test/unit/bio/appl/paml/codeml/test_report_single.rb +46 -0
  66. data/test/unit/bio/db/embl/test_sptr.rb +1 -1
  67. data/test/unit/bio/db/fasta/test_defline.rb +160 -0
  68. data/test/unit/bio/db/fasta/test_defline_misc.rb +490 -0
  69. data/test/unit/bio/db/kegg/test_genes.rb +281 -1
  70. data/test/unit/bio/db/kegg/test_genome.rb +408 -0
  71. data/test/unit/bio/db/kegg/test_module.rb +246 -0
  72. data/test/unit/bio/db/kegg/test_orthology.rb +95 -0
  73. data/test/unit/bio/db/kegg/test_pathway.rb +1250 -0
  74. data/test/unit/bio/db/test_aaindex.rb +8 -7
  75. data/test/unit/bio/db/test_fastq.rb +36 -0
  76. data/test/unit/bio/db/test_go.rb +171 -0
  77. data/test/unit/bio/db/test_medline.rb +148 -0
  78. data/test/unit/bio/db/test_qual.rb +9 -2
  79. data/test/unit/bio/sequence/test_sequence_masker.rb +169 -0
  80. data/test/unit/bio/test_tree.rb +260 -1
  81. data/test/unit/bio/util/test_contingency_table.rb +7 -7
  82. metadata +53 -6
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # = bio/db/kegg/genes.rb - KEGG/GENES database class
3
3
  #
4
- # Copyright:: Copyright (C) 2001, 2002, 2006
4
+ # Copyright:: Copyright (C) 2001, 2002, 2006, 2010
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # License:: The Ruby License
7
7
  #
@@ -29,8 +29,7 @@
29
29
  #
30
30
  # # NAME
31
31
  # p entry.name # => String
32
- # p entry.genes # => Array
33
- # p entry.gene # => String
32
+ # p entry.names # => Array
34
33
  #
35
34
  # # DEFINITION
36
35
  # p entry.definition # => String
@@ -47,7 +46,7 @@
47
46
  # p entry.locations # => Bio::Locations
48
47
  #
49
48
  # # MOTIF
50
- # p entry.motif # => Hash of Array
49
+ # p entry.motifs # => Hash of Array
51
50
  #
52
51
  # # DBLINKS
53
52
  # p entry.dblinks # => Hash of Array
@@ -169,10 +168,24 @@ class GENES < KEGGDB
169
168
  #
170
169
  # ---
171
170
  # *Returns*:: Array containing String
172
- def genes
171
+ def names_as_array
173
172
  name.split(', ')
174
173
  end
174
+ alias names names_as_array
175
+
176
+ # This method will be deprecated. Use Bio::KEGG::GENES#names instead.
177
+ #
178
+ # Names of the entry as an Array, described in the NAME line.
179
+ #
180
+ # ---
181
+ # *Returns*:: Array containing String
182
+ def genes
183
+ names_as_array
184
+ end
175
185
 
186
+ # This method will be deprecated.
187
+ # Use <tt>entry.names.first</tt> instead.
188
+ #
176
189
  # Returns the first gene name described in the NAME line.
177
190
  # ---
178
191
  # *Returns*:: String
@@ -191,12 +204,14 @@ class GENES < KEGGDB
191
204
  # ---
192
205
  # *Returns*:: Array containing String
193
206
  def eclinks
194
- ec_list = definition.slice(/\[EC:(.*?)\]/, 1)
195
- if ec_list
196
- ec_list.strip.split(/\s+/)
197
- else
198
- []
207
+ unless defined? @eclinks
208
+ ec_list =
209
+ definition.slice(/\[EC\:([^\]]+)\]/, 1) ||
210
+ definition.slice(/\(EC\:([^\)]+)\)/, 1)
211
+ ary = ec_list ? ec_list.strip.split(/\s+/) : []
212
+ @eclinks = ary
199
213
  end
214
+ @eclinks
200
215
  end
201
216
 
202
217
  # Orthologs described in the ORTHOLOGY lines.
@@ -210,7 +225,10 @@ class GENES < KEGGDB
210
225
  # ---
211
226
  # *Returns*:: String
212
227
  def pathway
213
- field_fetch('PATHWAY')
228
+ unless defined? @pathway
229
+ @pathway = fetch('PATHWAY')
230
+ end
231
+ @pathway
214
232
  end
215
233
 
216
234
  # Pathways described in the PATHWAY lines.
@@ -220,6 +238,16 @@ class GENES < KEGGDB
220
238
  lines_fetch('PATHWAY')
221
239
  end
222
240
 
241
+ # Returns CLASS field of the entry.
242
+ def keggclass
243
+ field_fetch('CLASS')
244
+ end
245
+
246
+ # Returns an Array of biological classes in CLASS field.
247
+ def keggclasses
248
+ keggclass.gsub(/ \[[^\]]+/, '').split(/\] ?/)
249
+ end
250
+
223
251
  # The position in the genome described in the POSITION line.
224
252
  # ---
225
253
  # *Returns*:: String
@@ -259,16 +287,23 @@ class GENES < KEGGDB
259
287
  Bio::Locations.new(gbposition)
260
288
  end
261
289
 
290
+ # Motif information described in the MOTIF lines.
291
+ # ---
292
+ # *Returns*:: Array containing String
293
+ def motifs_as_strings
294
+ lines_fetch('MOTIF')
295
+ end
296
+
262
297
  # Motif information described in the MOTIF lines.
263
298
  # ---
264
299
  # *Returns*:: Hash
265
- def motif
300
+ def motifs_as_hash
266
301
  unless @data['MOTIF']
267
302
  hash = {}
268
303
  db = nil
269
- lines_fetch('MOTIF').each do |line|
304
+ motifs_as_strings.each do |line|
270
305
  if line[/^\S+:/]
271
- db, str = line.split(/:/)
306
+ db, str = line.split(/:/, 2)
272
307
  else
273
308
  str = line
274
309
  end
@@ -279,6 +314,17 @@ class GENES < KEGGDB
279
314
  end
280
315
  @data['MOTIF'] # Hash of Array of IDs in MOTIF
281
316
  end
317
+ alias motifs motifs_as_hash
318
+
319
+ # The specification of the method will be changed in the future.
320
+ # Please use Bio::KEGG::GENES#motifs.
321
+ #
322
+ # Motif information described in the MOTIF lines.
323
+ # ---
324
+ # *Returns*:: Hash
325
+ def motif
326
+ motifs
327
+ end
282
328
 
283
329
  # Links to other databases described in the DBLINKS lines.
284
330
  # ---
@@ -298,7 +344,7 @@ class GENES < KEGGDB
298
344
  end
299
345
  alias structures structure
300
346
 
301
- # Codon usage data described in the CODON_USAGE lines.
347
+ # Codon usage data described in the CODON_USAGE lines. (Deprecated: this field no longer exists.)
302
348
  # ---
303
349
  # *Returns*:: Hash
304
350
  def codon_usage(codon = nil)
@@ -8,6 +8,8 @@
8
8
  #
9
9
 
10
10
  require 'bio/db'
11
+ require 'bio/reference'
12
+ require 'bio/db/kegg/common'
11
13
 
12
14
  module Bio
13
15
  class KEGG
@@ -27,10 +29,37 @@ class GENOME < KEGGDB
27
29
  DELIMITER = RS = "\n///\n"
28
30
  TAGSIZE = 12
29
31
 
32
+ include Common::References
33
+ # REFERENCE -- Returns contents of the REFERENCE records as an Array of
34
+ # Bio::Reference objects.
35
+ def references; super; end if false #dummy for RDoc
36
+
37
+
30
38
  def initialize(entry)
31
39
  super(entry, TAGSIZE)
32
40
  end
33
41
 
42
+ # (private) Returns a tag name of the field as a String.
43
+ # Needed to redefine because of the PLASMID field.
44
+ def tag_get(str)
45
+ if /\APLASMID\s+/ =~ str.to_s then
46
+ 'PLASMID'
47
+ else
48
+ super(str)
49
+ end
50
+ end
51
+ private :tag_get
52
+
53
+ # (private) Returns a String of the field without a tag name.
54
+ # Needed to redefine because of the PLASMID field.
55
+ def tag_cut(str)
56
+ if /\APLASMID\s+/ =~ str.to_s then
57
+ $'
58
+ else
59
+ super(str)
60
+ end
61
+ end
62
+ private :tag_cut
34
63
 
35
64
  # ENTRY -- Returns contents of the ENTRY record as a String.
36
65
  def entry_id
@@ -80,7 +109,20 @@ class GENOME < KEGGDB
80
109
 
81
110
  # ORIGINAL_DB -- Returns contents of the ORIGINAL_DB record as a String.
82
111
  def original_db
83
- field_fetch('ORIGINAL_DB')
112
+ #field_fetch('ORIGINAL_DB')
113
+ unless defined?(@original_db)
114
+ @original_db = fetch('ORIGINAL_DB')
115
+ end
116
+ @original_db
117
+ end
118
+
119
+ # Returns ORIGINAL_DB record as an Array containing String objects.
120
+ #
121
+ # ---
122
+ # *Arguments*:
123
+ # *Returns*:: Array containing String objects
124
+ def original_databases
125
+ lines_fetch('ORIGINAL_DB')
84
126
  end
85
127
 
86
128
  # DISEASE -- Returns contents of the COMMENT record as a String.
@@ -93,43 +135,6 @@ class GENOME < KEGGDB
93
135
  field_fetch('COMMENT')
94
136
  end
95
137
 
96
- # REFERENCE -- Returns contents of the REFERENCE records as an Array of
97
- # Bio::Reference objects.
98
- def references
99
- unless @data['REFERENCE']
100
- ary = []
101
- toptag2array(get('REFERENCE')).each do |ref|
102
- hash = Hash.new('')
103
- subtag2array(ref).each do |field|
104
- case tag_get(field)
105
- when /AUTHORS/
106
- authors = truncate(tag_cut(field))
107
- authors = authors.split(', ')
108
- authors[-1] = authors[-1].split(/\s+and\s+/)
109
- authors = authors.flatten.map { |a| a.sub(',', ', ') }
110
- hash['authors'] = authors
111
- when /TITLE/
112
- hash['title'] = truncate(tag_cut(field))
113
- when /JOURNAL/
114
- journal = truncate(tag_cut(field))
115
- if journal =~ /(.*) (\d+):(\d+)-(\d+) \((\d+)\) \[UI:(\d+)\]$/
116
- hash['journal'] = $1
117
- hash['volume'] = $2
118
- hash['pages'] = $3
119
- hash['year'] = $5
120
- hash['medline'] = $6
121
- else
122
- hash['journal'] = journal
123
- end
124
- end
125
- end
126
- ary.push(Reference.new(hash))
127
- end
128
- @data['REFERENCE'] = References.new(ary)
129
- end
130
- @data['REFERENCE']
131
- end
132
-
133
138
  # CHROMOSOME -- Returns contents of the CHROMOSOME records as an Array
134
139
  # of Hash.
135
140
  def chromosomes
@@ -0,0 +1,158 @@
1
+ #
2
+ # = bio/db/kegg/module.rb - KEGG MODULE database class
3
+ #
4
+ # Copyright:: Copyright (C) 2010 Kozo Nishida <kozo-ni@is.naist.jp>
5
+ # Copyright:: Copyright (C) 2010 Toshiaki Katayama <k@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ # $Id:$
9
+ #
10
+
11
+ require 'bio/db'
12
+ require 'bio/db/kegg/common'
13
+
14
+ module Bio
15
+ class KEGG
16
+
17
+ # == Description
18
+ #
19
+ # Bio::KEGG::MODULE is a parser class for the KEGG MODULE database entry.
20
+ #
21
+ # == References
22
+ #
23
+ # * http://www.kegg.jp/kegg-bin/get_htext?ko00002.keg
24
+ # * ftp://ftp.genome.jp/pub/kegg/pathway/module
25
+ #
26
+ class MODULE < KEGGDB
27
+
28
+ DELIMITER = RS = "\n///\n"
29
+ TAGSIZE = 12
30
+
31
+ #--
32
+ # for a private method strings_as_hash.
33
+ #++
34
+ include Common::StringsAsHash
35
+
36
+ # Creates a new Bio::KEGG::MODULE object.
37
+ # ---
38
+ # *Arguments*:
39
+ # * (required) _entry_: (String) single entry as a string
40
+ # *Returns*:: Bio::KEGG::MODULE object
41
+ def initialize(entry)
42
+ super(entry, TAGSIZE)
43
+ end
44
+
45
+ # Return the ID, described in the ENTRY line.
46
+ # ---
47
+ # *Returns*:: String
48
+ def entry_id
49
+ field_fetch('ENTRY')[/\S+/]
50
+ end
51
+
52
+ # Name of the module, described in the NAME line.
53
+ # ---
54
+ # *Returns*:: String
55
+ def name
56
+ field_fetch('NAME')
57
+ end
58
+
59
+ # Definition of the module, described in the DEFINITION line.
60
+ # ---
61
+ # *Returns*:: String
62
+ def definition
63
+ field_fetch('DEFINITION')
64
+ end
65
+
66
+ # Name of the KEGG class, described in the CLASS line.
67
+ # ---
68
+ # *Returns*:: String
69
+ def keggclass
70
+ field_fetch('CLASS')
71
+ end
72
+
73
+ # Pathways described in the PATHWAY lines.
74
+ # ---
75
+ # *Returns*:: Array containing String
76
+ def pathways_as_strings
77
+ lines_fetch('PATHWAY')
78
+ end
79
+
80
+ # Pathways described in the PATHWAY lines.
81
+ # ---
82
+ # *Returns*:: Hash of pathway ID and its definition
83
+ def pathways_as_hash
84
+ unless @pathways_as_hash
85
+ @pathways_as_hash = strings_as_hash(pathways_as_strings)
86
+ end
87
+ @pathways_as_hash
88
+ end
89
+ alias pathways pathways_as_hash
90
+
91
+
92
+ # Orthologs described in the ORTHOLOGY lines.
93
+ # ---
94
+ # *Returns*:: Array containing String
95
+ def orthologs_as_strings
96
+ lines_fetch('ORTHOLOGY')
97
+ end
98
+
99
+ # Orthologs described in the ORTHOLOGY lines.
100
+ # ---
101
+ # *Returns*:: Hash of orthology ID and its definition
102
+ def orthologs_as_hash
103
+ unless @orthologs_as_hash
104
+ @orthologs_as_hash = strings_as_hash(orthologs_as_strings)
105
+ end
106
+ @orthologs_as_hash
107
+ end
108
+ alias orthologs orthologs_as_hash
109
+
110
+ # All KO IDs in the ORTHOLOGY lines.
111
+ # ---
112
+ # *Returns*:: Array of orthology IDs
113
+ def orthologs_as_array
114
+ orthologs_as_hash.keys.map{|x| x.split(/\+|\-|,/)}.flatten.sort.uniq
115
+ end
116
+
117
+
118
+ # Reactions described in the REACTION lines.
119
+ # ---
120
+ # *Returns*:: Array containing String
121
+ def reactions_as_strings
122
+ lines_fetch('REACTION')
123
+ end
124
+
125
+ # Reactions described in the REACTION lines.
126
+ # ---
127
+ # *Returns*:: Hash of reaction ID and its definition
128
+ def reactions_as_hash
129
+ unless @reactions_as_hash
130
+ @reactions_as_hash = strings_as_hash(reactions_as_strings)
131
+ end
132
+ @reactions_as_hash
133
+ end
134
+ alias reactions reactions_as_hash
135
+
136
+
137
+ # Compounds described in the COMPOUND lines.
138
+ # ---
139
+ # *Returns*:: Array containing String
140
+ def compounds_as_strings
141
+ lines_fetch('COMPOUND')
142
+ end
143
+
144
+ # Compounds described in the COMPOUND lines.
145
+ # ---
146
+ # *Returns*:: Hash of compound ID and its definition
147
+ def compounds_as_hash
148
+ unless @compounds_as_hash
149
+ @compounds_as_hash = strings_as_hash(compounds_as_strings)
150
+ end
151
+ @compounds_as_hash
152
+ end
153
+ alias compounds compounds_as_hash
154
+
155
+ end # MODULE
156
+
157
+ end # KEGG
158
+ end # Bio
@@ -38,6 +38,27 @@ class ORTHOLOGY < KEGGDB
38
38
  def genes_as_hash; super; end if false #dummy for RDoc
39
39
  alias genes genes_as_hash
40
40
 
41
+ include Common::PathwaysAsHash
42
+ # Returns a Hash of the pathway ID and name in PATHWAY field.
43
+ def pathways_as_hash; super; end if false #dummy for RDoc
44
+ alias pathways pathways_as_hash
45
+
46
+ include Common::ModulesAsHash
47
+ # Returns MODULE field as a Hash.
48
+ # Each key of the hash is KEGG MODULE ID,
49
+ # and each value is the name of the Pathway Module.
50
+ # ---
51
+ # *Returns*:: Hash
52
+ def modules_as_hash; super; end if false #dummy for RDoc
53
+ alias modules modules_as_hash
54
+
55
+ include Common::References
56
+ # REFERENCE -- Returns contents of the REFERENCE records as an Array of
57
+ # Bio::Reference objects.
58
+ # ---
59
+ # *Returns*:: an Array containing Bio::Reference objects
60
+ def references; super; end if false #dummy for RDoc
61
+
41
62
  # Reads a flat file format entry of the KO database.
42
63
  def initialize(entry)
43
64
  super(entry, TAGSIZE)
@@ -73,10 +94,28 @@ class ORTHOLOGY < KEGGDB
73
94
  keggclass.gsub(/ \[[^\]]+/, '').split(/\] ?/)
74
95
  end
75
96
 
97
+ # Pathways described in the PATHWAY field.
98
+ # ---
99
+ # *Returns*:: Array containing String
100
+ def pathways_as_strings
101
+ lines_fetch('PATHWAY')
102
+ end
103
+
104
+ # *OBSOLETE* Do not use this method.
105
+ # Because the KEGG ORTHOLOGY format has changed and a PATHWAY field has been added,
106
+ # the older "pathways" method has been renamed and remains only for compatibility.
107
+ #
76
108
  # Returns an Array of KEGG/PATHWAY ID in CLASS field.
77
- def pathways
109
+ def pathways_in_keggclass
78
110
  keggclass.scan(/\[PATH:(.*?)\]/).flatten
79
111
  end
112
+
113
+ # Returns MODULE field of the entry.
114
+ # ---
115
+ # *Returns*:: Array containing String objects
116
+ def modules_as_strings
117
+ lines_fetch('MODULE')
118
+ end
80
119
 
81
120
  # Returns an Array of a database name and entry IDs in DBLINKS field.
82
121
  def dblinks_as_strings