bio 1.4.1 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. data/ChangeLog +954 -0
  2. data/KNOWN_ISSUES.rdoc +40 -5
  3. data/README.rdoc +36 -35
  4. data/RELEASE_NOTES.rdoc +87 -59
  5. data/bioruby.gemspec +24 -2
  6. data/doc/RELEASE_NOTES-1.4.1.rdoc +104 -0
  7. data/doc/Tutorial.rd +162 -200
  8. data/doc/Tutorial.rd.html +149 -146
  9. data/lib/bio.rb +1 -0
  10. data/lib/bio/appl/blast.rb +1 -1
  11. data/lib/bio/appl/blast/ddbj.rb +26 -34
  12. data/lib/bio/appl/blast/genomenet.rb +21 -11
  13. data/lib/bio/db/embl/sptr.rb +193 -21
  14. data/lib/bio/db/fasta.rb +1 -1
  15. data/lib/bio/db/fastq.rb +14 -0
  16. data/lib/bio/db/fastq/format_fastq.rb +2 -2
  17. data/lib/bio/db/genbank/ddbj.rb +1 -2
  18. data/lib/bio/db/genbank/format_genbank.rb +1 -1
  19. data/lib/bio/db/medline.rb +1 -0
  20. data/lib/bio/db/newick.rb +3 -1
  21. data/lib/bio/db/pdb/pdb.rb +9 -9
  22. data/lib/bio/db/pdb/residue.rb +2 -2
  23. data/lib/bio/io/ddbjrest.rb +344 -0
  24. data/lib/bio/io/ncbirest.rb +121 -1
  25. data/lib/bio/location.rb +2 -2
  26. data/lib/bio/reference.rb +3 -4
  27. data/lib/bio/shell/plugin/entry.rb +7 -3
  28. data/lib/bio/shell/plugin/ncbirest.rb +5 -1
  29. data/lib/bio/util/restriction_enzyme.rb +3 -0
  30. data/lib/bio/util/restriction_enzyme/dense_int_array.rb +195 -0
  31. data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +7 -7
  32. data/lib/bio/util/restriction_enzyme/range/sequence_range/calculated_cuts.rb +57 -18
  33. data/lib/bio/util/restriction_enzyme/range/sequence_range/fragment.rb +2 -2
  34. data/lib/bio/util/restriction_enzyme/sorted_num_array.rb +219 -0
  35. data/lib/bio/version.rb +1 -1
  36. data/sample/test_restriction_enzyme_long.rb +4403 -0
  37. data/test/data/fasta/EFTU_BACSU.fasta +8 -0
  38. data/test/data/genbank/CAA35997.gp +48 -0
  39. data/test/data/genbank/SCU49845.gb +167 -0
  40. data/test/data/litdb/1717226.litdb +13 -0
  41. data/test/data/pir/CRAB_ANAPL.pir +6 -0
  42. data/test/functional/bio/appl/blast/test_remote.rb +93 -0
  43. data/test/functional/bio/appl/test_blast.rb +61 -0
  44. data/test/functional/bio/io/test_ddbjrest.rb +47 -0
  45. data/test/functional/bio/test_command.rb +3 -3
  46. data/test/unit/bio/db/embl/test_sptr.rb +6 -6
  47. data/test/unit/bio/db/embl/test_uniprot_new_part.rb +208 -0
  48. data/test/unit/bio/db/genbank/test_common.rb +274 -0
  49. data/test/unit/bio/db/genbank/test_genbank.rb +401 -0
  50. data/test/unit/bio/db/genbank/test_genpept.rb +81 -0
  51. data/test/unit/bio/db/pdb/test_pdb.rb +3287 -11
  52. data/test/unit/bio/db/test_fasta.rb +34 -12
  53. data/test/unit/bio/db/test_fastq.rb +26 -0
  54. data/test/unit/bio/db/test_litdb.rb +95 -0
  55. data/test/unit/bio/db/test_medline.rb +1 -0
  56. data/test/unit/bio/db/test_nbrf.rb +82 -0
  57. data/test/unit/bio/db/test_newick.rb +22 -4
  58. data/test/unit/bio/test_reference.rb +35 -0
  59. data/test/unit/bio/util/restriction_enzyme/test_dense_int_array.rb +201 -0
  60. data/test/unit/bio/util/restriction_enzyme/test_sorted_num_array.rb +281 -0
  61. metadata +44 -38
@@ -127,6 +127,7 @@ class REST
127
127
  def ncbi_post_form(serv, opts)
128
128
  ncbi_check_parameters(opts)
129
129
  ncbi_access_wait
130
+ #$stderr.puts opts.inspect
130
131
  response = Bio::Command.post_form(serv, opts)
131
132
  response
132
133
  end
@@ -485,7 +486,7 @@ class REST
485
486
  # nucleotide = nuccore + nucest + nucgss
486
487
  #
487
488
  # format (rettype):
488
- # * native all but Gene Default format for viewing sequences
489
+ # * native all but Gene ASN Default format for viewing sequences
489
490
  # * fasta all sequence FASTA view of a sequence
490
491
  # * gb NA sequence GenBank view for sequences
491
492
  # * gbc NA sequence INSDSeq structured flat file
@@ -540,6 +541,125 @@ class REST
540
541
  Bio::NCBI::REST.efetch(ids, opts)
541
542
  end
542
543
 
544
+ # Retrieve nucleotide sequence entries by given IDs using E-Utils
545
+ # (efetch).
546
+ #
547
+ # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html
548
+ # nucleotide = nuccore + nucest + nucgss
549
+ #
550
+ # format (rettype):
551
+ # * native all but Gene ASN Default format for viewing sequences
552
+ # * fasta all sequence FASTA view of a sequence
553
+ # * gb NA sequence GenBank view for sequences
554
+ # * gbc NA sequence INSDSeq structured flat file
555
+ # * gbwithparts NA sequence GenBank CON division with sequences
556
+ # * est dbEST sequence EST Report
557
+ # * gss dbGSS sequence GSS Report
558
+ # * gp AA sequence GenPept view
559
+ # * gpc AA sequence INSDSeq structured flat file
560
+ # * seqid all sequence Convert GIs into seqids
561
+ # * acc all sequence Convert GIs into accessions
562
+ # * chr dbSNP only SNP Chromosome Report
563
+ # * flt dbSNP only SNP Flat File report
564
+ # * rsr dbSNP only SNP RS Cluster report
565
+ # * brief dbSNP only SNP ID list
566
+ # * docset dbSNP only SNP RS summary
567
+ #
568
+ # == Usage
569
+ #
570
+ # Bio::NCBI::REST::EFetch.nucleotide("123,U12345,U12345.1,gb|U12345|")
571
+ #
572
+ # list = [123, "U12345.1", "gb|U12345|"]
573
+ # Bio::NCBI::REST::EFetch.nucleotide(list)
574
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "fasta")
575
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "acc")
576
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "xml")
577
+ #
578
+ # Bio::NCBI::REST::EFetch.nucleotide("AE009950")
579
+ # Bio::NCBI::REST::EFetch.nucleotide("AE009950", "gbwithparts")
580
+ #
581
+ # ncbi = Bio::NCBI::REST::EFetch.new
582
+ # ncbi.nucleotide("123,U12345,U12345.1,gb|U12345|")
583
+ # ncbi.nucleotide(list)
584
+ # ncbi.nucleotide(list, "fasta")
585
+ # ncbi.nucleotide(list, "acc")
586
+ # ncbi.nucleotide(list, "xml")
587
+ # ncbi.nucleotide("AE009950")
588
+ # ncbi.nucleotide("AE009950", "gbwithparts")
589
+ #
590
+ # ---
591
+ #
592
+ # *Arguments*:
593
+ # * _ids_: list of NCBI entry IDs (required)
594
+ # * _format_: "gb", "gbc", "fasta", "acc", "xml" etc.
595
+ # *Returns*:: String
596
def nucleotide(ids, format = "gb", hash = {})
  # "xml" is accepted as an alias and sent as rettype "gbc"
  # (INSDSeq structured flat file); any other value is passed as given.
  rettype = ("xml" == format) ? "gbc" : format
  # Entries of +hash+ are merged last, so they may add parameters or
  # override "db" / "rettype".
  opts = { "db" => "nucleotide", "rettype" => rettype }.merge(hash)
  Bio::NCBI::REST.efetch(ids, opts)
end
605
+
606
+ # Retrieve protein sequence entries by given IDs using E-Utils
607
+ # (efetch).
608
+ #
609
+ # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html
610
+ # protein
611
+ #
612
+ # format (rettype):
613
+ # * native all but Gene ASN Default format for viewing sequences
614
+ # * fasta all sequence FASTA view of a sequence
615
+ # * gb NA sequence GenBank view for sequences
616
+ # * gbc NA sequence INSDSeq structured flat file
617
+ # * gbwithparts NA sequence GenBank CON division with sequences
618
+ # * est dbEST sequence EST Report
619
+ # * gss dbGSS sequence GSS Report
620
+ # * gp AA sequence GenPept view
621
+ # * gpc AA sequence INSDSeq structured flat file
622
+ # * seqid all sequence Convert GIs into seqids
623
+ # * acc all sequence Convert GIs into accessions
624
+ # * chr dbSNP only SNP Chromosome Report
625
+ # * flt dbSNP only SNP Flat File report
626
+ # * rsr dbSNP only SNP RS Cluster report
627
+ # * brief dbSNP only SNP ID list
628
+ # * docset dbSNP only SNP RS summary
629
+ #
630
+ # == Usage
631
+ #
632
+ # Bio::NCBI::REST::EFetch.protein("7527480,AAF63163.1,AAF63163")
633
+ #
634
+ # list = [ 7527480, "AAF63163.1", "AAF63163"]
635
+ # Bio::NCBI::REST::EFetch.protein(list)
636
+ # Bio::NCBI::REST::EFetch.protein(list, "fasta")
637
+ # Bio::NCBI::REST::EFetch.protein(list, "acc")
638
+ # Bio::NCBI::REST::EFetch.protein(list, "xml")
639
+ #
640
+ # ncbi = Bio::NCBI::REST::EFetch.new
641
+ # ncbi.protein("7527480,AAF63163.1,AAF63163")
642
+ # ncbi.protein(list)
643
+ # ncbi.protein(list, "fasta")
644
+ # ncbi.protein(list, "acc")
645
+ # ncbi.protein(list, "xml")
646
+ #
647
+ # ---
648
+ #
649
+ # *Arguments*:
650
+ # * _ids_: list of NCBI entry IDs (required)
651
+ # * _format_: "gp", "gpc", "fasta", "acc", "xml" etc.
652
+ # *Returns*:: String
653
def protein(ids, format = "gp", hash = {})
  # "xml" is accepted as an alias and sent as rettype "gpc"
  # (INSDSeq structured flat file); any other value is passed as given.
  rettype = ("xml" == format) ? "gpc" : format
  # Entries of +hash+ are merged last, so they may add parameters or
  # override "db" / "rettype".
  opts = { "db" => "protein", "rettype" => rettype }.merge(hash)
  Bio::NCBI::REST.efetch(ids, opts)
end
662
+
543
663
  # Retrieve PubMed entries by given IDs using E-Utils (efetch).
544
664
  #
545
665
  # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
@@ -632,8 +632,8 @@ class Locations
632
632
  end
633
633
  end
634
634
 
635
- join_list.each do |position|
636
- ary << gbl_pos2loc(position)
635
+ join_list.each do |pos|
636
+ ary << gbl_pos2loc(pos)
637
637
  end
638
638
 
639
639
  when /^complement\((.*)\)$/ # (J) complement()
@@ -272,7 +272,7 @@ module Bio
272
272
  lines << "%N #{@issue}" unless @issue.to_s.empty?
273
273
  lines << "%P #{@pages}" unless @pages.empty?
274
274
  lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
275
- u = @url.empty? ? pubmed_url : @url
275
+ u = @url.to_s.empty? ? pubmed_url : @url
276
276
  lines << "%U #{u}" unless u.empty?
277
277
  lines << "%X #{@abstract}" unless @abstract.empty?
278
278
  @mesh.each do |term|
@@ -587,9 +587,8 @@ module Bio
587
587
  # *Returns*:: String
588
588
  def pubmed_url
589
589
  unless @pubmed.to_s.empty?
590
- cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
591
- opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
592
- return "#{cgi}?#{opts}=#{@pubmed}"
590
+ head = "http://www.ncbi.nlm.nih.gov/pubmed"
591
+ return "#{head}/#{@pubmed}"
593
592
  end
594
593
  ''
595
594
  end
@@ -62,7 +62,7 @@ module Bio::Shell
62
62
  # * "db:entry" -- local BioFlat, OBDA, EMBOSS, KEGG API
63
63
  def getent(arg)
64
64
  entry = ""
65
- db, entry_id = arg.to_s.strip.split(/:/)
65
+ db, entry_id = arg.to_s.strip.split(/\:/, 2)
66
66
 
67
67
  # local file
68
68
  if arg.respond_to?(:gets) or File.exists?(arg)
@@ -81,8 +81,12 @@ module Bio::Shell
81
81
 
82
82
  else
83
83
  # EMBOSS USA in ~/.embossrc
84
- str = entret(arg)
85
- if $?.exitstatus == 0 and str.length != 0
84
+ begin
85
+ str = entret(arg)
86
+ rescue SystemCallError
87
+ str = ''
88
+ end
89
+ if $? and $?.exitstatus == 0 and str.length != 0
86
90
  puts "Retrieving entry from EMBOSS (#{arg})"
87
91
  entry = str
88
92
 
@@ -31,7 +31,11 @@ module Bio::Shell
31
31
  # Otherwise, it acts the same as Bio::NCBI::REST.efetch.
32
32
  def efetch(ids, *arg)
33
33
  if arg.empty? then
34
- Bio::NCBI::REST::EFetch.sequence(ids)
34
+ ret = Bio::NCBI::REST::EFetch.nucleotide(ids)
35
+ unless /^LOCUS / =~ ret.to_s then
36
+ ret = Bio::NCBI::REST::EFetch.protein(ids)
37
+ end
38
+ ret
35
39
  elsif arg[0].kind_of?(Symbol)
36
40
  meth = arg[0]
37
41
  case meth.to_s
@@ -125,6 +125,9 @@ class RestrictionEnzyme
125
125
  autoload :Analysis, 'bio/util/restriction_enzyme/analysis'
126
126
  autoload :Range, 'bio/util/restriction_enzyme/range/sequence_range'
127
127
 
128
+ autoload :SortedNumArray, 'bio/util/restriction_enzyme/sorted_num_array'
129
+ autoload :DenseIntArray, 'bio/util/restriction_enzyme/dense_int_array'
130
+
128
131
  include CutSymbol
129
132
  extend CutSymbol
130
133
 
@@ -0,0 +1,195 @@
1
+ #
2
+ # bio/util/restriction_enzyme/dense_int_array.rb - Internal data storage for Bio::RestrictionEnzyme::Range::SequenceRange
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Naohisa Goto <ng@bioruby.org>
6
+ # Tomoaki NISHIYAMA
7
+ # License:: The Ruby License
8
+ #
9
+
10
+ module Bio
11
+ class RestrictionEnzyme
12
+
13
+ # a class to store integer numbers, containing many contiguous
14
+ # integral numbers.
15
+ #
16
+ # Bio::RestrictionEnzyme internal use only.
17
+ # Please do not create the instance outside Bio::RestrictionEnzyme.
18
+ class DenseIntArray
19
+ MutableRange = Struct.new(:first, :last)
20
+
21
+ include Enumerable
22
+
23
+ # Same usage as Array.[]
24
+ def self.[](*args)
25
+ a = self.new
26
+ args.each do |elem|
27
+ a.push elem
28
+ end
29
+ a
30
+ end
31
+
32
+ # creates a new object
33
+ def initialize
34
+ @data = []
35
+ end
36
+
37
+ # initialize copy
38
+ def initialize_copy(other)
39
+ super(other)
40
+ @data = @data.collect { |elem| elem.dup }
41
+ end
42
+
43
+ # sets internal data object
44
+ def internal_data=(a)
45
+ #clear_cache
46
+ @data = a
47
+ self
48
+ end
49
+ protected :internal_data=
50
+
51
+ # gets internal data object
52
+ def internal_data
53
+ @data
54
+ end
55
+ protected :internal_data
56
+
57
+ # Same usage as Array#[]
58
+ def [](*arg)
59
+ #$stderr.puts "SortedIntArray#[]"
60
+ to_a[*arg]
61
+ end
62
+
63
+ # Not implemented
64
+ def []=(*arg)
65
+ raise NotImplementedError, 'DenseIntArray#[]= is not implemented.'
66
+ end
67
+
68
+ # Same usage as Array#each
69
+ def each
70
+ @data.each do |elem|
71
+ elem.first.upto(elem.last) { |num| yield num }
72
+ end
73
+ self
74
+ end
75
+
76
+ # Same usage as Array#reverse_each
77
+ def reverse_each
78
+ @data.reverse_each do |elem|
79
+ elem.last.downto(elem.first) { |num| yield num }
80
+ end
81
+ self
82
+ end
83
+
84
+ # Same usage as Array#+, but accepts only the same classes instance.
85
+ def +(other)
86
+ unless other.is_a?(self.class) then
87
+ raise TypeError, 'unsupported data type'
88
+ end
89
+ tmpdata = @data + other.internal_data
90
+ tmpdata.sort! { |a,b| a.first <=> b.first }
91
+ result = self.class.new
92
+ return result if tmpdata.empty?
93
+ newdata = result.internal_data
94
+ newdata.push tmpdata[0].dup
95
+ (1...(tmpdata.size)).each do |i|
96
+ if (x = newdata[-1].last) >= tmpdata[i].first then
97
+ newdata[-1].last = tmpdata[i].last if tmpdata[i].last > x
98
+ else
99
+ newdata.push tmpdata[i].dup
100
+ end
101
+ end
102
+ result
103
+ end
104
+
105
+ # Same usage as Array#==
106
+ def ==(other)
107
+ if r = super(other) then
108
+ r
109
+ elsif other.is_a?(self.class) then
110
+ other.internal_data == @data
111
+ else
112
+ false
113
+ end
114
+ end
115
+
116
+ # Same usage as Array#concat
117
+ def concat(ary)
118
+ ary.each { |elem| self.<<(elem) }
119
+ self
120
+ end
121
+
122
+ # Same usage as Array#push
123
+ def push(*args)
124
+ args.each do |elem|
125
+ self.<<(elem)
126
+ end
127
+ self
128
+ end
129
+
130
+ # Same usage as Array#unshift
131
+ def unshift(*arg)
132
+ raise NotImplementedError, 'DenseIntArray#unshift is not implemented.'
133
+ end
134
+
135
+ # Same usage as Array#<<
136
+ def <<(elem)
137
+ if !@data.empty? and
138
+ @data[-1].last + 1 == elem then
139
+ @data[-1].last = elem
140
+ else
141
+ @data << MutableRange.new(elem, elem)
142
+ end
143
+ self
144
+ end
145
+
146
+ # Same usage as Array#include?
147
+ def include?(elem)
148
+ return false if @data.empty? or elem < self.first or self.last < elem
149
+ @data.any? do |range|
150
+ range.first <= elem && elem <= range.last
151
+ end
152
+ end
153
+
154
+ # Same usage as Array#first
155
+ def first
156
+ elem = @data.first
157
+ elem ? elem.first : nil
158
+ end
159
+
160
+ # Same usage as Array#last
161
+ def last
162
+ elem = @data.last
163
+ elem ? elem.last : nil
164
+ end
165
+
166
+ # Same usage as Array#size
167
+ def size
168
+ sum = 0
169
+ @data.each do |range|
170
+ sum += (range.last - range.first + 1)
171
+ end
172
+ sum
173
+ end
174
+ alias length size
175
+
176
+ # Same usage as Array#delete
177
+ def delete(elem)
178
+ raise NotImplementedError, 'DenseIntArray#delete is not implemented.'
179
+ end
180
+
181
+ # Does nothing
182
+ def sort!(&block)
183
+ # does nothing
184
+ self
185
+ end
186
+
187
+ # Does nothing
188
+ def uniq!
189
+ # does nothing
190
+ self
191
+ end
192
+ end #class DenseIntArray
193
+
194
+ end #class RestrictionEnzyme
195
+ end #module Bio
@@ -5,7 +5,7 @@
5
5
  # Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: sequence_range.rb,v 1.9 2007/07/16 19:28:48 k Exp $
8
+ # $Id:$
9
9
  #
10
10
 
11
11
  require 'bio/util/restriction_enzyme'
@@ -160,7 +160,7 @@ class SequenceRange
160
160
  @__fragments_current = true
161
161
 
162
162
  num_txt = '0123456789'
163
- num_txt_repeat = (num_txt * ( @size / num_txt.size.to_f ).ceil)[0..@size-1]
163
+ num_txt_repeat = (num_txt * ( @size.div(num_txt.size) + 1))[0..@size-1]
164
164
  fragments = Fragments.new(num_txt_repeat, num_txt_repeat)
165
165
 
166
166
  cc = Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts.new(@size)
@@ -193,9 +193,9 @@ class SequenceRange
193
193
  # * +cc+: Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts
194
194
  # *Returns*:: +Hash+ Keys are unique, values are Bio::RestrictionEnzyme::Range::SequenceRange::Bin objects filled with indexes of the sequence locations they represent.
195
195
  def create_bins(cc)
196
- p_cut = cc.vc_primary
197
- c_cut = cc.vc_complement
198
- h_cut = cc.hc_between_strands
196
+ p_cut = cc.vc_primary_as_original_class
197
+ c_cut = cc.vc_complement_as_original_class
198
+ h_cut = cc.hc_between_strands_as_original_class
199
199
 
200
200
  if @circular
201
201
  # NOTE
@@ -247,8 +247,8 @@ class SequenceRange
247
247
  # initializing the bin.
248
248
  def setup_new_bin(bins, bin_id)
249
249
  bins[ bin_id ] = Bin.new
250
- bins[ bin_id ].p = []
251
- bins[ bin_id ].c = []
250
+ bins[ bin_id ].p = DenseIntArray[] #could be replaced by SortedNumArray[]
251
+ bins[ bin_id ].c = DenseIntArray[] #could be replaced by SortedNumArray[]
252
252
  end
253
253
 
254
254
  end # SequenceRange