bio 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: sptr.rb,v 1.36 2007/04/05 23:35:40 trevor Exp $
7
+ # $Id:$
8
8
  #
9
9
  # == Description
10
10
  #
@@ -241,7 +241,7 @@ class SPTR < EMBLDB
241
241
  records = gn_line.split(/\s*and\s*/)
242
242
  records.each do |record|
243
243
  gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
244
- record.each(';') do |element|
244
+ record.each_line(';') do |element|
245
245
  case element
246
246
  when /Name=/ then
247
247
  gene_hash[:name] = $'[0..-2]
@@ -505,11 +505,10 @@ class SPTR < EMBLDB
505
505
  else
506
506
  hash['journal'] = value
507
507
  end
508
- when 'RX' # PUBMED, MEDLINE
509
- value.split('.').each {|item|
510
- tag, xref = item.split(/; /).map {|i| i.strip }
508
+ when 'RX' # PUBMED, MEDLINE, DOI
509
+ value.each do |tag, xref|
511
510
  hash[ tag.downcase ] = xref
512
- }
511
+ end
513
512
  end
514
513
  }
515
514
  Reference.new(hash)
@@ -678,17 +677,17 @@ class SPTR < EMBLDB
678
677
  when 'COFACTOR'
679
678
  return @data['CC'][topic]
680
679
  when 'DEVELOPMENTAL STAGE'
681
- return @data['CC'][topic].to_s
680
+ return @data['CC'][topic].join('')
682
681
  when 'DISEASE'
683
- return @data['CC'][topic].to_s
682
+ return @data['CC'][topic].join('')
684
683
  when 'DOMAIN'
685
684
  return @data['CC'][topic]
686
685
  when 'ENZYME REGULATION'
687
- return @data['CC'][topic].to_s
686
+ return @data['CC'][topic].join('')
688
687
  when 'FUNCTION'
689
- return @data['CC'][topic].to_s
688
+ return @data['CC'][topic].join('')
690
689
  when 'INDUCTION'
691
- return @data['CC'][topic].to_s
690
+ return @data['CC'][topic].join('')
692
691
  when 'INTERACTION'
693
692
  return cc_interaction(@data['CC'][topic])
694
693
  when 'MASS SPECTROMETRY'
@@ -749,7 +748,7 @@ class SPTR < EMBLDB
749
748
 
750
749
 
751
750
  def cc_alternative_products(data)
752
- ap = data.to_s
751
+ ap = data.join('')
753
752
  return ap unless ap
754
753
 
755
754
  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
@@ -822,7 +821,7 @@ class SPTR < EMBLDB
822
821
 
823
822
 
824
823
  def cc_caution(data)
825
- data.to_s
824
+ data.join('')
826
825
  end
827
826
  private :cc_caution
828
827
 
@@ -831,7 +830,7 @@ class SPTR < EMBLDB
831
830
  #
832
831
  # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
833
832
  def cc_interaction(data)
834
- str = data.to_s
833
+ str = data.join('')
835
834
  it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
836
835
  it.map {|ent|
837
836
  ent.map! {|x| x.strip }
@@ -894,7 +893,7 @@ class SPTR < EMBLDB
894
893
 
895
894
 
896
895
  def cc_rna_editing(data)
897
- data = data.to_s
896
+ data = data.join('')
898
897
  entry = {'Modified_positions' => [], 'Note' => ""}
899
898
  if data =~ /Modified_positions=(.+?)(\.|;)/
900
899
  entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
@@ -961,7 +960,7 @@ class SPTR < EMBLDB
961
960
  unless key
962
961
  embl_dr
963
962
  else
964
- embl_dr[key].map {|x|
963
+ (embl_dr[key] or []).map {|x|
965
964
  {'Accession' => x[0],
966
965
  'Version' => x[1],
967
966
  ' ' => x[2],
@@ -4,13 +4,11 @@
4
4
  # Copyright:: Copyright (C) 2003 GOTO Naohisa <ng@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: fantom.rb,v 1.14 2007/04/05 23:35:40 trevor Exp $
7
+ # $Id:$
8
8
  #
9
9
 
10
- begin
11
- require 'rexml/document'
12
- rescue LoadError
13
- end
10
+ require 'rexml/document'
11
+ require 'cgi'
14
12
  require 'uri'
15
13
  require 'net/http'
16
14
 
@@ -32,17 +30,17 @@ module Bio
32
30
  def get_by_id(idstr, http_proxy = nil)
33
31
  addr = 'fantom.gsc.riken.go.jp'
34
32
  port = 80
35
- path = "/db/maxml/maxmlseq.cgi?masterid=#{URI.escape(idstr.to_s)}&style=xml"
33
+ path = "/db/maxml/maxmlseq.cgi?masterid=#{CGI.escape(idstr.to_s)}&style=xml"
36
34
  xml = ''
37
35
  if http_proxy then
38
36
  proxy = URI.parse(http_proxy.to_s)
39
37
  Net::HTTP.start(addr, port, proxy.host, proxy.port) do |http|
40
- response, = http.get(path)
38
+ response = http.get(path)
41
39
  xml = response.body
42
40
  end
43
41
  else
44
42
  Bio::Command.start_http(addr, port) do |http|
45
- response, = http.get(path)
43
+ response = http.get(path)
46
44
  xml = response.body
47
45
  end
48
46
  end
@@ -2,11 +2,11 @@
2
2
  # = bio/db/fasta.rb - FASTA format class
3
3
  #
4
4
  # Copyright:: Copyright (C) 2001, 2002
5
- # GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>,
5
+ # Naohisa Goto <ng@bioruby.org>,
6
6
  # Toshiaki Katayama <k@bioruby.org>
7
7
  # License:: The Ruby License
8
8
  #
9
- # $Id: fasta.rb,v 1.28 2007/04/05 23:35:40 trevor Exp $
9
+ # $Id: fasta.rb,v 1.28.2.3 2008/06/20 13:43:36 ngoto Exp $
10
10
  #
11
11
  # == Description
12
12
  #
@@ -14,45 +14,7 @@
14
14
  #
15
15
  # == Examples
16
16
  #
17
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
18
- # rub.entry_id ==> 'gi|671595'
19
- # rub.get('emb') ==> 'CAA85678.1'
20
- # rub.emb ==> 'CAA85678.1'
21
- # rub.gi ==> '671595'
22
- # rub.accession ==> 'CAA85678'
23
- # rub.accessions ==> [ 'CAA85678' ]
24
- # rub.acc_version ==> 'CAA85678.1'
25
- # rub.locus ==> nil
26
- # rub.list_ids ==> [["gi", "671595"],
27
- # ["emb", "CAA85678.1", nil],
28
- # ["Perovskia abrotanoides"]]
29
- #
30
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
31
- # ckr.entry_id ==> "gi|2495000"
32
- # ckr.sp ==> "CCKR_CAVPO"
33
- # ckr.pir ==> "I51898"
34
- # ckr.gb ==> "AAB29504.1"
35
- # ckr.gi ==> "2495000"
36
- # ckr.accession ==> "AAB29504"
37
- # ckr.accessions ==> ["Q63931", "AAB29504"]
38
- # ckr.acc_version ==> "AAB29504.1"
39
- # ckr.locus ==> nil
40
- # ckr.description ==>
41
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
42
- # ckr.descriptions ==>
43
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
44
- # "cholecystokinin A receptor - guinea pig",
45
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
46
- # ckr.words ==>
47
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
48
- # "receptor", "type"]
49
- # ckr.id_strings ==>
50
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
51
- # "544724", "AAB29504.1", "Cavia"]
52
- # ckr.list_ids ==>
53
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
54
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
55
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
17
+ # See documents of Bio::FastaFormat class.
56
18
  #
57
19
  # == References
58
20
  #
@@ -65,6 +27,8 @@
65
27
 
66
28
  require 'bio/db'
67
29
  require 'bio/sequence'
30
+ require 'bio/sequence/dblink'
31
+ require 'bio/db/fasta/defline'
68
32
 
69
33
  module Bio
70
34
 
@@ -81,7 +45,7 @@ module Bio
81
45
  #
82
46
  # === Examples
83
47
  #
84
- # f_str = <<END
48
+ # f_str = <<END_OF_STRING
85
49
  # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
86
50
  # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
87
51
  # VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
@@ -101,7 +65,7 @@ module Bio
101
65
  # CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
102
66
  # FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
103
67
  # KTGDPLEWRRLFKKISTICRDIILIPN
104
- # END
68
+ # END_OF_STRING
105
69
  #
106
70
  # f = Bio::FastaFormat.new(f_str)
107
71
  # puts "### FastaFormat"
@@ -253,12 +217,10 @@ module Bio
253
217
  # might also be changed (but not always be changed)
254
218
  # because of efficiency.
255
219
  #
256
- def to_seq
257
- seq
258
- obj = Bio::Sequence.new(@seq)
259
- obj.definition = self.definition
260
- obj
220
+ def to_biosequence
221
+ Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat)
261
222
  end
223
+ alias to_seq to_biosequence
262
224
 
263
225
  # Parsing FASTA Defline, and extract IDs.
264
226
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
@@ -362,465 +324,6 @@ module Bio
362
324
 
363
325
  end #class FastaNumericFormat
364
326
 
365
-
366
- # Parsing FASTA Defline, and extract IDs and other informations.
367
- # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
368
- # or ":"-separated IDs.
369
- #
370
- # specs are described in:
371
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
372
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
373
- #
374
- # === Examples
375
- #
376
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
377
- # rub.entry_id ==> 'gi|671595'
378
- # rub.get('emb') ==> 'CAA85678.1'
379
- # rub.emb ==> 'CAA85678.1'
380
- # rub.gi ==> '671595'
381
- # rub.accession ==> 'CAA85678'
382
- # rub.accessions ==> [ 'CAA85678' ]
383
- # rub.acc_version ==> 'CAA85678.1'
384
- # rub.locus ==> nil
385
- # rub.list_ids ==> [["gi", "671595"],
386
- # ["emb", "CAA85678.1", nil],
387
- # ["Perovskia abrotanoides"]]
388
- #
389
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
390
- # ckr.entry_id ==> "gi|2495000"
391
- # ckr.sp ==> "CCKR_CAVPO"
392
- # ckr.pir ==> "I51898"
393
- # ckr.gb ==> "AAB29504.1"
394
- # ckr.gi ==> "2495000"
395
- # ckr.accession ==> "AAB29504"
396
- # ckr.accessions ==> ["Q63931", "AAB29504"]
397
- # ckr.acc_version ==> "AAB29504.1"
398
- # ckr.locus ==> nil
399
- # ckr.description ==>
400
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
401
- # ckr.descriptions ==>
402
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
403
- # "cholecystokinin A receptor - guinea pig",
404
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
405
- # ckr.words ==>
406
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
407
- # "receptor", "type"]
408
- # ckr.id_strings ==>
409
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
410
- # "544724", "AAB29504.1", "Cavia"]
411
- # ckr.list_ids ==>
412
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
413
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
414
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
415
- #
416
- # === Refereneces
417
- #
418
- # * Fasta format description (NCBI)
419
- # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
420
- #
421
- # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
422
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
423
- #
424
- # * README.formatdb
425
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
426
- #
427
- class FastaDefline
428
-
429
- NSIDs = {
430
- # NCBI and WU-BLAST
431
- 'gi' => [ 'gi' ], # NCBI GI
432
- 'gb' => [ 'acc_version', 'locus' ], # GenBank
433
- 'emb' => [ 'acc_version', 'locus' ], # EMBL
434
- 'dbj' => [ 'acc_version', 'locus' ], # DDBJ
435
- 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
436
- 'pdb' => [ 'entry_id', 'chain' ], # PDB
437
- 'bbs' => [ 'number' ], # GenInfo Backbone Id
438
- 'gnl' => [ 'database' , 'entry_id' ], # General database identifier
439
- 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
440
- 'lcl' => [ 'entry_id' ], # Local Sequence identifier
441
-
442
- # WU-BLAST and NCBI
443
- 'pir' => [ 'accession', 'entry_id' ], # PIR
444
- 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
445
- 'pat' => [ 'country', 'number', 'serial' ], # Patents
446
-
447
- # WU-BLAST only
448
- 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
449
- 'gim' => [ 'number' ], # NCBI GenInfo Import identifier
450
- 'gp' => [ 'acc_version', 'locus' ], # GenPept
451
- 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
452
- 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
453
- 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
454
- 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
455
-
456
- # Original
457
- 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
458
- }
459
-
460
- # Shows array that contains IDs (or ID-like strings).
461
- # Returns an array of arrays of strings.
462
- attr_reader :list_ids
463
-
464
- # Shows a possibly unique identifier.
465
- # Returns a string.
466
- attr_reader :entry_id
467
-
468
- # Parses given string.
469
- def initialize(str)
470
- @deflines = []
471
- @info = {}
472
- @list_ids = []
473
-
474
- @entry_id = nil
475
-
476
- lines = str.split("\x01")
477
- lines.each do |line|
478
- add_defline(line)
479
- end
480
- end #def initialize
481
-
482
- # Parses given string and adds parsed data.
483
- def add_defline(str)
484
- case str
485
- when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
486
- # NSIDs
487
- # examples:
488
- # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
489
- #
490
- # note: regexp (:?) means grouping without backreferences
491
- i = $1
492
- d = $2
493
- tks = i.split('|')
494
- tks << '' if i[-1,1] == '|'
495
- a = parse_NSIDs(tks)
496
- i = a[0].join('|')
497
- a.unshift('|')
498
- d = tks.join('|') + ' ' + d unless tks.empty?
499
- a << d
500
- this_line = a
501
- match_EC(d)
502
- parse_square_brackets(d).each do |x|
503
- if !match_EC(x, false) and x =~ /\A[A-Z]/ then
504
- di = [ x ]
505
- @list_ids << di
506
- @info['organism'] = x unless @info['organism']
507
- end
508
- end
509
-
510
- when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
511
- # examples:
512
- # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
513
- # >emb:CACDC28 [X80034] C.albicans CDC28 gene
514
- i = $1
515
- d = $2
516
- a = parse_ColonSepID(i)
517
- i = a.join(':')
518
- this_line = [ ':', a , d ]
519
- match_EC(d)
520
- parse_square_brackets(d).each do |x|
521
- if !match_EC(x, false) and x =~ /:/ then
522
- parse_ColonSepID(x)
523
- elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
524
- @list_ids << [ $1 ]
525
- end
526
- end
527
-
528
- when /^\>?\s*(\S+)(?:\s+(.+))?$/
529
- # examples:
530
- # >ABC12345 this is test
531
- i = $1
532
- d = $2.to_s
533
- @list_ids << [ i.chomp('.') ]
534
- this_line = [ '', [ i ], d ]
535
- match_EC(d)
536
- else
537
- i = str
538
- d = ''
539
- match_EC(i)
540
- this_line = [ '', [ i ], d ]
541
- end
542
-
543
- @deflines << this_line
544
- @entry_id = i unless @entry_id
545
- end
546
-
547
- def match_EC(str, write_flag = true)
548
- di = nil
549
- str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
550
- di = [ 'EC', $1 ]
551
- if write_flag then
552
- @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
553
- @list_ids << di
554
- end
555
- end
556
- di
557
- end
558
- private :match_EC
559
-
560
- def parse_square_brackets(str)
561
- r = []
562
- str.scan(/\[([^\]]*)\]/) do |x|
563
- r << x[0]
564
- end
565
- r
566
- end
567
- private :parse_square_brackets
568
-
569
- def parse_ColonSepID(str)
570
- di = str.split(':', 2)
571
- di << nil if di.size <= 1
572
- @list_ids << di
573
- di
574
- end
575
- private :parse_ColonSepID
576
-
577
- def parse_NSIDs(ary)
578
- # this method destroys ary
579
- data = []
580
- while token = ary.shift
581
- if labels = self.class::NSIDs[token] then
582
- di = [ token ]
583
- idtype = token
584
- labels.each do |x|
585
- token = ary.shift
586
- break unless token
587
- if self.class::NSIDs[token] then
588
- ary.unshift(token)
589
- break #each
590
- end
591
- if token.length > 0 then
592
- di << token
593
- else
594
- di << nil
595
- end
596
- end
597
- data << di
598
- else
599
- if token.length > 0 then
600
- # UCID (uncontrolled identifiers)
601
- di = [ token ]
602
- data << di
603
- @info['ucid'] = token unless @info['ucid']
604
- end
605
- break #while
606
- end
607
- end #while
608
- @list_ids.concat data
609
- data
610
- end #def parse_NSIDs
611
- private :parse_NSIDs
612
-
613
-
614
- # Shows original string.
615
- # Note that the result of this method may be different from
616
- # original string which is given in FastaDefline.new method.
617
- def to_s
618
- @deflines.collect { |a|
619
- s = a[0]
620
- (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
621
- }.join("\x01")
622
- end
623
-
624
- # Shows description.
625
- def description
626
- @deflines[0].to_a[-1]
627
- end
628
-
629
- # Returns descriptions.
630
- def descriptions
631
- @deflines.collect do |a|
632
- a[-1]
633
- end
634
- end
635
-
636
- # Shows ID-like strings.
637
- # Returns an array of strings.
638
- def id_strings
639
- r = []
640
- @list_ids.each do |a|
641
- if a.size >= 2 then
642
- r.concat a[1..-1].find_all { |x| x }
643
- else
644
- if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
645
- r << a[0]
646
- end
647
- end
648
- end
649
- r.concat( words(true, []).find_all do |x|
650
- x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
651
- x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
652
- end)
653
- r
654
- end
655
-
656
- KillWords = [
657
- 'an', 'the', 'this', 'that',
658
- 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
659
- 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
660
- 'from', 'and', 'or', 'not',
661
- 'dna', 'rna', 'mrna', 'cdna', 'orf',
662
- 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
663
- 'similar', 'involved', 'identical', 'identity',
664
- 'cds', 'clone', 'library', 'contig', 'contigs',
665
- 'homolog', 'homologue', 'homologs', 'homologous',
666
- 'protein', 'proteins', 'gene', 'genes',
667
- 'product', 'products', 'sequence', 'sequences',
668
- 'strain', 'strains', 'region', 'regions',
669
- ]
670
- KillWordsHash = {}
671
- KillWords.each { |x| KillWordsHash[x] = true }
672
-
673
- KillRegexpArray = [
674
- /\A\d{1,3}\%?\z/,
675
- /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
676
- /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
677
- ]
678
-
679
- # Shows words used in the defline. Returns an Array.
680
- def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
681
- kwhash = self.class::KillWordsHash)
682
- a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
683
- a.collect! do |x|
684
- x.sub!(/\A[\$\*\-\+]+/, '')
685
- x.sub!(/[\$\*\-\=]+\z/, '')
686
- if x.size <= 1 then
687
- nil
688
- elsif kwhash[x.downcase] then
689
- nil
690
- else
691
- if kill_regexp.find { |expr| expr =~ x } then
692
- nil
693
- else
694
- x
695
- end
696
- end
697
- end
698
- a.compact!
699
- a.collect! { |x| x.downcase } unless case_sensitive
700
- a.sort!
701
- a.uniq!
702
- a
703
- end
704
-
705
- # Returns identifires by a database name.
706
- def get(dbname)
707
- db = dbname.to_s
708
- r = nil
709
- unless r = @info[db] then
710
- di = @list_ids.find { |x| x[0] == db.to_s }
711
- if di and di.size <= 2 then
712
- r = di[-1]
713
- elsif di then
714
- labels = self.class::NSIDs[db]
715
- [ 'acc_version', 'entry_id',
716
- 'locus', 'accession', 'number'].each do |x|
717
- if i = labels.index(x) then
718
- r = di[i+1]
719
- break if r
720
- end
721
- end
722
- r = di[1..-1].find { |x| x } unless r
723
- end
724
- @info[db] = r if r
725
- end
726
- r
727
- end
728
-
729
- # Returns an identifier by given type.
730
- def get_by_type(type_str)
731
- @list_ids.each do |x|
732
- if labels = self.class::NSIDs[x[0]] then
733
- if i = labels.index(type_str) then
734
- return x[i+1]
735
- end
736
- end
737
- end
738
- nil
739
- end
740
-
741
- # Returns identifiers by given type.
742
- def get_all_by_type(*type_strarg)
743
- d = []
744
- @list_ids.each do |x|
745
- if labels = self.class::NSIDs[x[0]] then
746
- type_strarg.each do |y|
747
- if i = labels.index(y) then
748
- d << x[i+1] if x[i+1]
749
- end
750
- end
751
- end
752
- end
753
- d
754
- end
755
-
756
- # Shows locus.
757
- # If the entry has more than two of such IDs,
758
- # only the first ID are shown.
759
- # Returns a string or nil.
760
- def locus
761
- unless defined?(@locus)
762
- @locus = get_by_type('locus')
763
- end
764
- @locus
765
- end
766
-
767
- # Shows GI.
768
- # If the entry has more than two of such IDs,
769
- # only the first ID are shown.
770
- # Returns a string or nil.
771
- def gi
772
- unless defined?(@gi) then
773
- @gi = get_by_type('gi')
774
- end
775
- @gi
776
- end
777
-
778
- # Shows accession with version number.
779
- # If the entry has more than two of such IDs,
780
- # only the first ID are shown.
781
- # Returns a string or nil.
782
- def acc_version
783
- unless defined?(@acc_version) then
784
- @acc_version = get_by_type('acc_version')
785
- end
786
- @acc_version
787
- end
788
-
789
- # Shows accession numbers.
790
- # Returns an array of strings.
791
- def accessions
792
- unless defined?(@accessions) then
793
- @accessions = get_all_by_type('accession', 'acc_version')
794
- @accessions.collect! { |x| x.sub(/\..*\z/, '') }
795
- end
796
- @accessions
797
- end
798
-
799
- # Shows an accession number.
800
- def accession
801
- unless defined?(@accession) then
802
- if acc_version then
803
- @accession = acc_version.split('.')[0]
804
- else
805
- @accession = accessions[0]
806
- end
807
- end
808
- @accession
809
- end
810
-
811
- def method_missing(name, *args)
812
- # raise ArgumentError,
813
- # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
814
- r = get(name, *args)
815
- if !r and !(self.class::NSIDs[name.to_s]) then
816
- raise "NameError: undefined method `#{name.inspect}'"
817
- end
818
- r
819
- end
820
-
821
-
822
- end #class FastaDefline
823
-
824
327
  end #module Bio
825
328
 
826
329
  if __FILE__ == $0