bio 1.4.1 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. data/ChangeLog +954 -0
  2. data/KNOWN_ISSUES.rdoc +40 -5
  3. data/README.rdoc +36 -35
  4. data/RELEASE_NOTES.rdoc +87 -59
  5. data/bioruby.gemspec +24 -2
  6. data/doc/RELEASE_NOTES-1.4.1.rdoc +104 -0
  7. data/doc/Tutorial.rd +162 -200
  8. data/doc/Tutorial.rd.html +149 -146
  9. data/lib/bio.rb +1 -0
  10. data/lib/bio/appl/blast.rb +1 -1
  11. data/lib/bio/appl/blast/ddbj.rb +26 -34
  12. data/lib/bio/appl/blast/genomenet.rb +21 -11
  13. data/lib/bio/db/embl/sptr.rb +193 -21
  14. data/lib/bio/db/fasta.rb +1 -1
  15. data/lib/bio/db/fastq.rb +14 -0
  16. data/lib/bio/db/fastq/format_fastq.rb +2 -2
  17. data/lib/bio/db/genbank/ddbj.rb +1 -2
  18. data/lib/bio/db/genbank/format_genbank.rb +1 -1
  19. data/lib/bio/db/medline.rb +1 -0
  20. data/lib/bio/db/newick.rb +3 -1
  21. data/lib/bio/db/pdb/pdb.rb +9 -9
  22. data/lib/bio/db/pdb/residue.rb +2 -2
  23. data/lib/bio/io/ddbjrest.rb +344 -0
  24. data/lib/bio/io/ncbirest.rb +121 -1
  25. data/lib/bio/location.rb +2 -2
  26. data/lib/bio/reference.rb +3 -4
  27. data/lib/bio/shell/plugin/entry.rb +7 -3
  28. data/lib/bio/shell/plugin/ncbirest.rb +5 -1
  29. data/lib/bio/util/restriction_enzyme.rb +3 -0
  30. data/lib/bio/util/restriction_enzyme/dense_int_array.rb +195 -0
  31. data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +7 -7
  32. data/lib/bio/util/restriction_enzyme/range/sequence_range/calculated_cuts.rb +57 -18
  33. data/lib/bio/util/restriction_enzyme/range/sequence_range/fragment.rb +2 -2
  34. data/lib/bio/util/restriction_enzyme/sorted_num_array.rb +219 -0
  35. data/lib/bio/version.rb +1 -1
  36. data/sample/test_restriction_enzyme_long.rb +4403 -0
  37. data/test/data/fasta/EFTU_BACSU.fasta +8 -0
  38. data/test/data/genbank/CAA35997.gp +48 -0
  39. data/test/data/genbank/SCU49845.gb +167 -0
  40. data/test/data/litdb/1717226.litdb +13 -0
  41. data/test/data/pir/CRAB_ANAPL.pir +6 -0
  42. data/test/functional/bio/appl/blast/test_remote.rb +93 -0
  43. data/test/functional/bio/appl/test_blast.rb +61 -0
  44. data/test/functional/bio/io/test_ddbjrest.rb +47 -0
  45. data/test/functional/bio/test_command.rb +3 -3
  46. data/test/unit/bio/db/embl/test_sptr.rb +6 -6
  47. data/test/unit/bio/db/embl/test_uniprot_new_part.rb +208 -0
  48. data/test/unit/bio/db/genbank/test_common.rb +274 -0
  49. data/test/unit/bio/db/genbank/test_genbank.rb +401 -0
  50. data/test/unit/bio/db/genbank/test_genpept.rb +81 -0
  51. data/test/unit/bio/db/pdb/test_pdb.rb +3287 -11
  52. data/test/unit/bio/db/test_fasta.rb +34 -12
  53. data/test/unit/bio/db/test_fastq.rb +26 -0
  54. data/test/unit/bio/db/test_litdb.rb +95 -0
  55. data/test/unit/bio/db/test_medline.rb +1 -0
  56. data/test/unit/bio/db/test_nbrf.rb +82 -0
  57. data/test/unit/bio/db/test_newick.rb +22 -4
  58. data/test/unit/bio/test_reference.rb +35 -0
  59. data/test/unit/bio/util/restriction_enzyme/test_dense_int_array.rb +201 -0
  60. data/test/unit/bio/util/restriction_enzyme/test_sorted_num_array.rb +281 -0
  61. metadata +44 -38
@@ -127,6 +127,7 @@ class REST
127
127
  def ncbi_post_form(serv, opts)
128
128
  ncbi_check_parameters(opts)
129
129
  ncbi_access_wait
130
+ #$stderr.puts opts.inspect
130
131
  response = Bio::Command.post_form(serv, opts)
131
132
  response
132
133
  end
@@ -485,7 +486,7 @@ class REST
485
486
  # nucleotide = nuccore + nucest + nucgss
486
487
  #
487
488
  # format (rettype):
488
- # * native all but Gene Default format for viewing sequences
489
+ # * native all but Gene ASN Default format for viewing sequences
489
490
  # * fasta all sequence FASTA view of a sequence
490
491
  # * gb NA sequence GenBank view for sequences
491
492
  # * gbc NA sequence INSDSeq structured flat file
@@ -540,6 +541,125 @@ class REST
540
541
  Bio::NCBI::REST.efetch(ids, opts)
541
542
  end
542
543
 
544
+ # Retrieve nucleotide sequence entries by given IDs using E-Utils
545
+ # (efetch).
546
+ #
547
+ # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html
548
+ # nucleotide = nuccore + nucest + nucgss
549
+ #
550
+ # format (rettype):
551
+ # * native all but Gene ASN Default format for viewing sequences
552
+ # * fasta all sequence FASTA view of a sequence
553
+ # * gb NA sequence GenBank view for sequences
554
+ # * gbc NA sequence INSDSeq structured flat file
555
+ # * gbwithparts NA sequence GenBank CON division with sequences
556
+ # * est dbEST sequence EST Report
557
+ # * gss dbGSS sequence GSS Report
558
+ # * gp AA sequence GenPept view
559
+ # * gpc AA sequence INSDSeq structured flat file
560
+ # * seqid all sequence Convert GIs into seqids
561
+ # * acc all sequence Convert GIs into accessions
562
+ # * chr dbSNP only SNP Chromosome Report
563
+ # * flt dbSNP only SNP Flat File report
564
+ # * rsr dbSNP only SNP RS Cluster report
565
+ # * brief dbSNP only SNP ID list
566
+ # * docset dbSNP only SNP RS summary
567
+ #
568
+ # == Usage
569
+ #
570
+ # Bio::NCBI::REST::EFetch.nucleotide("123,U12345,U12345.1,gb|U12345|")
571
+ #
572
+ # list = [123, "U12345.1", "gb|U12345|"]
573
+ # Bio::NCBI::REST::EFetch.nucleotide(list)
574
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "fasta")
575
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "acc")
576
+ # Bio::NCBI::REST::EFetch.nucleotide(list, "xml")
577
+ #
578
+ # Bio::NCBI::REST::EFetch.nucleotide("AE009950")
579
+ # Bio::NCBI::REST::EFetch.nucleotide("AE009950", "gbwithparts")
580
+ #
581
+ # ncbi = Bio::NCBI::REST::EFetch.new
582
+ # ncbi.nucleotide("123,U12345,U12345.1,gb|U12345|")
583
+ # ncbi.nucleotide(list)
584
+ # ncbi.nucleotide(list, "fasta")
585
+ # ncbi.nucleotide(list, "acc")
586
+ # ncbi.nucleotide(list, "xml")
587
+ # ncbi.nucleotide("AE009950")
588
+ # ncbi.nucleotide("AE009950", "gbwithparts")
589
+ #
590
+ # ---
591
+ #
592
+ # *Arguments*:
593
+ # * _ids_: list of NCBI entry IDs (required)
594
+ # * _format_: "gb", "gbc", "fasta", "acc", "xml" etc.
595
+ # *Returns*:: String
596
+ def nucleotide(ids, format = "gb", hash = {})
597
+ case format
598
+ when "xml"
599
+ format = "gbc"
600
+ end
601
+ opts = { "db" => "nucleotide", "rettype" => format }
602
+ opts.update(hash)
603
+ Bio::NCBI::REST.efetch(ids, opts)
604
+ end
605
+
606
+ # Retrieve protein sequence entries by given IDs using E-Utils
607
+ # (efetch).
608
+ #
609
+ # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html
610
+ # protein
611
+ #
612
+ # format (rettype):
613
+ # * native all but Gene ASN Default format for viewing sequences
614
+ # * fasta all sequence FASTA view of a sequence
615
+ # * gb NA sequence GenBank view for sequences
616
+ # * gbc NA sequence INSDSeq structured flat file
617
+ # * gbwithparts NA sequence GenBank CON division with sequences
618
+ # * est dbEST sequence EST Report
619
+ # * gss dbGSS sequence GSS Report
620
+ # * gp AA sequence GenPept view
621
+ # * gpc AA sequence INSDSeq structured flat file
622
+ # * seqid all sequence Convert GIs into seqids
623
+ # * acc all sequence Convert GIs into accessions
624
+ # * chr dbSNP only SNP Chromosome Report
625
+ # * flt dbSNP only SNP Flat File report
626
+ # * rsr dbSNP only SNP RS Cluster report
627
+ # * brief dbSNP only SNP ID list
628
+ # * docset dbSNP only SNP RS summary
629
+ #
630
+ # == Usage
631
+ #
632
+ # Bio::NCBI::REST::EFetch.protein("7527480,AAF63163.1,AAF63163")
633
+ #
634
+ # list = [ 7527480, "AAF63163.1", "AAF63163"]
635
+ # Bio::NCBI::REST::EFetch.protein(list)
636
+ # Bio::NCBI::REST::EFetch.protein(list, "fasta")
637
+ # Bio::NCBI::REST::EFetch.protein(list, "acc")
638
+ # Bio::NCBI::REST::EFetch.protein(list, "xml")
639
+ #
640
+ # ncbi = Bio::NCBI::REST::EFetch.new
641
+ # ncbi.protein("7527480,AAF63163.1,AAF63163")
642
+ # ncbi.protein(list)
643
+ # ncbi.protein(list, "fasta")
644
+ # ncbi.protein(list, "acc")
645
+ # ncbi.protein(list, "xml")
646
+ #
647
+ # ---
648
+ #
649
+ # *Arguments*:
650
+ # * _ids_: list of NCBI entry IDs (required)
651
+ # * _format_: "gp", "gpc", "fasta", "acc", "xml" etc.
652
+ # *Returns*:: String
653
+ def protein(ids, format = "gp", hash = {})
654
+ case format
655
+ when "xml"
656
+ format = "gpc"
657
+ end
658
+ opts = { "db" => "protein", "rettype" => format }
659
+ opts.update(hash)
660
+ Bio::NCBI::REST.efetch(ids, opts)
661
+ end
662
+
543
663
  # Retrieve PubMed entries by given IDs using E-Utils (efetch).
544
664
  #
545
665
  # * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
@@ -632,8 +632,8 @@ class Locations
632
632
  end
633
633
  end
634
634
 
635
- join_list.each do |position|
636
- ary << gbl_pos2loc(position)
635
+ join_list.each do |pos|
636
+ ary << gbl_pos2loc(pos)
637
637
  end
638
638
 
639
639
  when /^complement\((.*)\)$/ # (J) complement()
@@ -272,7 +272,7 @@ module Bio
272
272
  lines << "%N #{@issue}" unless @issue.to_s.empty?
273
273
  lines << "%P #{@pages}" unless @pages.empty?
274
274
  lines << "%M #{@pubmed}" unless @pubmed.to_s.empty?
275
- u = @url.empty? ? pubmed_url : @url
275
+ u = @url.to_s.empty? ? pubmed_url : @url
276
276
  lines << "%U #{u}" unless u.empty?
277
277
  lines << "%X #{@abstract}" unless @abstract.empty?
278
278
  @mesh.each do |term|
@@ -587,9 +587,8 @@ module Bio
587
587
  # *Returns*:: String
588
588
  def pubmed_url
589
589
  unless @pubmed.to_s.empty?
590
- cgi = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi"
591
- opts = "cmd=Retrieve&db=PubMed&dopt=Citation&list_uids"
592
- return "#{cgi}?#{opts}=#{@pubmed}"
590
+ head = "http://www.ncbi.nlm.nih.gov/pubmed"
591
+ return "#{head}/#{@pubmed}"
593
592
  end
594
593
  ''
595
594
  end
@@ -62,7 +62,7 @@ module Bio::Shell
62
62
  # * "db:entry" -- local BioFlat, OBDA, EMBOSS, KEGG API
63
63
  def getent(arg)
64
64
  entry = ""
65
- db, entry_id = arg.to_s.strip.split(/:/)
65
+ db, entry_id = arg.to_s.strip.split(/\:/, 2)
66
66
 
67
67
  # local file
68
68
  if arg.respond_to?(:gets) or File.exists?(arg)
@@ -81,8 +81,12 @@ module Bio::Shell
81
81
 
82
82
  else
83
83
  # EMBOSS USA in ~/.embossrc
84
- str = entret(arg)
85
- if $?.exitstatus == 0 and str.length != 0
84
+ begin
85
+ str = entret(arg)
86
+ rescue SystemCallError
87
+ str = ''
88
+ end
89
+ if $? and $?.exitstatus == 0 and str.length != 0
86
90
  puts "Retrieving entry from EMBOSS (#{arg})"
87
91
  entry = str
88
92
 
@@ -31,7 +31,11 @@ module Bio::Shell
31
31
  # Otherwise, it acts the same as Bio::NCBI::REST.efetch.
32
32
  def efetch(ids, *arg)
33
33
  if arg.empty? then
34
- Bio::NCBI::REST::EFetch.sequence(ids)
34
+ ret = Bio::NCBI::REST::EFetch.nucleotide(ids)
35
+ unless /^LOCUS / =~ ret.to_s then
36
+ ret = Bio::NCBI::REST::EFetch.protein(ids)
37
+ end
38
+ ret
35
39
  elsif arg[0].kind_of?(Symbol)
36
40
  meth = arg[0]
37
41
  case meth.to_s
@@ -125,6 +125,9 @@ class RestrictionEnzyme
125
125
  autoload :Analysis, 'bio/util/restriction_enzyme/analysis'
126
126
  autoload :Range, 'bio/util/restriction_enzyme/range/sequence_range'
127
127
 
128
+ autoload :SortedNumArray, 'bio/util/restriction_enzyme/sorted_num_array'
129
+ autoload :DenseIntArray, 'bio/util/restriction_enzyme/dense_int_array'
130
+
128
131
  include CutSymbol
129
132
  extend CutSymbol
130
133
 
@@ -0,0 +1,195 @@
1
+ #
2
+ # bio/util/restriction_enzyme/dense_int_array.rb - Internal data storage for Bio::RestrictionEnzyme::Range::SequenceRange
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Naohisa Goto <ng@bioruby.org>
6
+ # Tomoaki NISHIYAMA
7
+ # License:: The Ruby License
8
+ #
9
+
10
+ module Bio
11
+ class RestrictionEnzyme
12
+
13
+ # a class to store integer numbers, containing many contiguous
14
+ # integral numbers.
15
+ #
16
+ # Bio::RestrictionEnzyme internal use only.
17
+ # Please do not create the instance outside Bio::RestrictionEnzyme.
18
+ class DenseIntArray
19
+ MutableRange = Struct.new(:first, :last)
20
+
21
+ include Enumerable
22
+
23
+ # Same usage as Array.[]
24
+ def self.[](*args)
25
+ a = self.new
26
+ args.each do |elem|
27
+ a.push elem
28
+ end
29
+ a
30
+ end
31
+
32
+ # creates a new object
33
+ def initialize
34
+ @data = []
35
+ end
36
+
37
+ # initialize copy
38
+ def initialize_copy(other)
39
+ super(other)
40
+ @data = @data.collect { |elem| elem.dup }
41
+ end
42
+
43
+ # sets internal data object
44
+ def internal_data=(a)
45
+ #clear_cache
46
+ @data = a
47
+ self
48
+ end
49
+ protected :internal_data=
50
+
51
+ # gets internal data object
52
+ def internal_data
53
+ @data
54
+ end
55
+ protected :internal_data
56
+
57
+ # Same usage as Array#[]
58
+ def [](*arg)
59
+ #$stderr.puts "SortedIntArray#[]"
60
+ to_a[*arg]
61
+ end
62
+
63
+ # Not implemented
64
+ def []=(*arg)
65
+ raise NotImplementedError, 'DenseIntArray#[]= is not implemented.'
66
+ end
67
+
68
+ # Same usage as Array#each
69
+ def each
70
+ @data.each do |elem|
71
+ elem.first.upto(elem.last) { |num| yield num }
72
+ end
73
+ self
74
+ end
75
+
76
+ # Same usage as Array#reverse_each
77
+ def reverse_each
78
+ @data.reverse_each do |elem|
79
+ elem.last.downto(elem.first) { |num| yield num }
80
+ end
81
+ self
82
+ end
83
+
84
+ # Same usage as Array#+, but accepts only the same classes instance.
85
+ def +(other)
86
+ unless other.is_a?(self.class) then
87
+ raise TypeError, 'unsupported data type'
88
+ end
89
+ tmpdata = @data + other.internal_data
90
+ tmpdata.sort! { |a,b| a.first <=> b.first }
91
+ result = self.class.new
92
+ return result if tmpdata.empty?
93
+ newdata = result.internal_data
94
+ newdata.push tmpdata[0].dup
95
+ (1...(tmpdata.size)).each do |i|
96
+ if (x = newdata[-1].last) >= tmpdata[i].first then
97
+ newdata[-1].last = tmpdata[i].last if tmpdata[i].last > x
98
+ else
99
+ newdata.push tmpdata[i].dup
100
+ end
101
+ end
102
+ result
103
+ end
104
+
105
+ # Same usage as Array#==
106
+ def ==(other)
107
+ if r = super(other) then
108
+ r
109
+ elsif other.is_a?(self.class) then
110
+ other.internal_data == @data
111
+ else
112
+ false
113
+ end
114
+ end
115
+
116
+ # Same usage as Array#concat
117
+ def concat(ary)
118
+ ary.each { |elem| self.<<(elem) }
119
+ self
120
+ end
121
+
122
+ # Same usage as Array#push
123
+ def push(*args)
124
+ args.each do |elem|
125
+ self.<<(elem)
126
+ end
127
+ self
128
+ end
129
+
130
+ # Same usage as Array#unshift
131
+ def unshift(*arg)
132
+ raise NotImplementedError, 'DenseIntArray#unshift is not implemented.'
133
+ end
134
+
135
+ # Same usage as Array#<<
136
+ def <<(elem)
137
+ if !@data.empty? and
138
+ @data[-1].last + 1 == elem then
139
+ @data[-1].last = elem
140
+ else
141
+ @data << MutableRange.new(elem, elem)
142
+ end
143
+ self
144
+ end
145
+
146
+ # Same usage as Array#include?
147
+ def include?(elem)
148
+ return false if @data.empty? or elem < self.first or self.last < elem
149
+ @data.any? do |range|
150
+ range.first <= elem && elem <= range.last
151
+ end
152
+ end
153
+
154
+ # Same usage as Array#first
155
+ def first
156
+ elem = @data.first
157
+ elem ? elem.first : nil
158
+ end
159
+
160
+ # Same usage as Array#last
161
+ def last
162
+ elem = @data.last
163
+ elem ? elem.last : nil
164
+ end
165
+
166
+ # Same usage as Array#size
167
+ def size
168
+ sum = 0
169
+ @data.each do |range|
170
+ sum += (range.last - range.first + 1)
171
+ end
172
+ sum
173
+ end
174
+ alias length size
175
+
176
+ # Same usage as Array#delete
177
+ def delete(elem)
178
+ raise NotImplementedError, 'DenseIntArray#delete is not implemented.'
179
+ end
180
+
181
+ # Does nothing
182
+ def sort!(&block)
183
+ # does nothing
184
+ self
185
+ end
186
+
187
+ # Does nothing
188
+ def uniq!
189
+ # does nothing
190
+ self
191
+ end
192
+ end #class DenseIntArray
193
+
194
+ end #class RestrictionEnzyme
195
+ end #module Bio
@@ -5,7 +5,7 @@
5
5
  # Copyright:: Copyright (c) 2005-2007 Midwinter Laboratories, LLC (http://midwinterlabs.com)
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: sequence_range.rb,v 1.9 2007/07/16 19:28:48 k Exp $
8
+ # $Id:$
9
9
  #
10
10
 
11
11
  require 'bio/util/restriction_enzyme'
@@ -160,7 +160,7 @@ class SequenceRange
160
160
  @__fragments_current = true
161
161
 
162
162
  num_txt = '0123456789'
163
- num_txt_repeat = (num_txt * ( @size / num_txt.size.to_f ).ceil)[0..@size-1]
163
+ num_txt_repeat = (num_txt * ( @size.div(num_txt.size) + 1))[0..@size-1]
164
164
  fragments = Fragments.new(num_txt_repeat, num_txt_repeat)
165
165
 
166
166
  cc = Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts.new(@size)
@@ -193,9 +193,9 @@ class SequenceRange
193
193
  # * +cc+: Bio::RestrictionEnzyme::Range::SequenceRange::CalculatedCuts
194
194
  # *Returns*:: +Hash+ Keys are unique, values are Bio::RestrictionEnzyme::Range::SequenceRange::Bin objects filled with indexes of the sequence locations they represent.
195
195
  def create_bins(cc)
196
- p_cut = cc.vc_primary
197
- c_cut = cc.vc_complement
198
- h_cut = cc.hc_between_strands
196
+ p_cut = cc.vc_primary_as_original_class
197
+ c_cut = cc.vc_complement_as_original_class
198
+ h_cut = cc.hc_between_strands_as_original_class
199
199
 
200
200
  if @circular
201
201
  # NOTE
@@ -247,8 +247,8 @@ class SequenceRange
247
247
  # initializing the bin.
248
248
  def setup_new_bin(bins, bin_id)
249
249
  bins[ bin_id ] = Bin.new
250
- bins[ bin_id ].p = []
251
- bins[ bin_id ].c = []
250
+ bins[ bin_id ].p = DenseIntArray[] #could be replaced by SortedNumArray[]
251
+ bins[ bin_id ].c = DenseIntArray[] #could be replaced by SortedNumArray[]
252
252
  end
253
253
 
254
254
  end # SequenceRange