bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259)
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -4,7 +4,7 @@
4
4
  # Copyright:: Copyright (C) 2001-2006 Mitsuteru C. Nakao <n@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: sptr.rb,v 1.36 2007/04/05 23:35:40 trevor Exp $
7
+ # $Id:$
8
8
  #
9
9
  # == Description
10
10
  #
@@ -241,7 +241,7 @@ class SPTR < EMBLDB
241
241
  records = gn_line.split(/\s*and\s*/)
242
242
  records.each do |record|
243
243
  gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
244
- record.each(';') do |element|
244
+ record.each_line(';') do |element|
245
245
  case element
246
246
  when /Name=/ then
247
247
  gene_hash[:name] = $'[0..-2]
@@ -505,11 +505,10 @@ class SPTR < EMBLDB
505
505
  else
506
506
  hash['journal'] = value
507
507
  end
508
- when 'RX' # PUBMED, MEDLINE
509
- value.split('.').each {|item|
510
- tag, xref = item.split(/; /).map {|i| i.strip }
508
+ when 'RX' # PUBMED, MEDLINE, DOI
509
+ value.each do |tag, xref|
511
510
  hash[ tag.downcase ] = xref
512
- }
511
+ end
513
512
  end
514
513
  }
515
514
  Reference.new(hash)
@@ -678,17 +677,17 @@ class SPTR < EMBLDB
678
677
  when 'COFACTOR'
679
678
  return @data['CC'][topic]
680
679
  when 'DEVELOPMENTAL STAGE'
681
- return @data['CC'][topic].to_s
680
+ return @data['CC'][topic].join('')
682
681
  when 'DISEASE'
683
- return @data['CC'][topic].to_s
682
+ return @data['CC'][topic].join('')
684
683
  when 'DOMAIN'
685
684
  return @data['CC'][topic]
686
685
  when 'ENZYME REGULATION'
687
- return @data['CC'][topic].to_s
686
+ return @data['CC'][topic].join('')
688
687
  when 'FUNCTION'
689
- return @data['CC'][topic].to_s
688
+ return @data['CC'][topic].join('')
690
689
  when 'INDUCTION'
691
- return @data['CC'][topic].to_s
690
+ return @data['CC'][topic].join('')
692
691
  when 'INTERACTION'
693
692
  return cc_interaction(@data['CC'][topic])
694
693
  when 'MASS SPECTROMETRY'
@@ -749,7 +748,7 @@ class SPTR < EMBLDB
749
748
 
750
749
 
751
750
  def cc_alternative_products(data)
752
- ap = data.to_s
751
+ ap = data.join('')
753
752
  return ap unless ap
754
753
 
755
754
  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequnce]+
@@ -822,7 +821,7 @@ class SPTR < EMBLDB
822
821
 
823
822
 
824
823
  def cc_caution(data)
825
- data.to_s
824
+ data.join('')
826
825
  end
827
826
  private :cc_caution
828
827
 
@@ -831,7 +830,7 @@ class SPTR < EMBLDB
831
830
  #
832
831
  # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
833
832
  def cc_interaction(data)
834
- str = data.to_s
833
+ str = data.join('')
835
834
  it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
836
835
  it.map {|ent|
837
836
  ent.map! {|x| x.strip }
@@ -894,7 +893,7 @@ class SPTR < EMBLDB
894
893
 
895
894
 
896
895
  def cc_rna_editing(data)
897
- data = data.to_s
896
+ data = data.join('')
898
897
  entry = {'Modified_positions' => [], 'Note' => ""}
899
898
  if data =~ /Modified_positions=(.+?)(\.|;)/
900
899
  entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')
@@ -961,7 +960,7 @@ class SPTR < EMBLDB
961
960
  unless key
962
961
  embl_dr
963
962
  else
964
- embl_dr[key].map {|x|
963
+ (embl_dr[key] or []).map {|x|
965
964
  {'Accession' => x[0],
966
965
  'Version' => x[1],
967
966
  ' ' => x[2],
@@ -4,13 +4,11 @@
4
4
  # Copyright:: Copyright (C) 2003 GOTO Naohisa <ng@bioruby.org>
5
5
  # License:: The Ruby License
6
6
  #
7
- # $Id: fantom.rb,v 1.14 2007/04/05 23:35:40 trevor Exp $
7
+ # $Id:$
8
8
  #
9
9
 
10
- begin
11
- require 'rexml/document'
12
- rescue LoadError
13
- end
10
+ require 'rexml/document'
11
+ require 'cgi'
14
12
  require 'uri'
15
13
  require 'net/http'
16
14
 
@@ -32,17 +30,17 @@ module Bio
32
30
  def get_by_id(idstr, http_proxy = nil)
33
31
  addr = 'fantom.gsc.riken.go.jp'
34
32
  port = 80
35
- path = "/db/maxml/maxmlseq.cgi?masterid=#{URI.escape(idstr.to_s)}&style=xml"
33
+ path = "/db/maxml/maxmlseq.cgi?masterid=#{CGI.escape(idstr.to_s)}&style=xml"
36
34
  xml = ''
37
35
  if http_proxy then
38
36
  proxy = URI.parse(http_proxy.to_s)
39
37
  Net::HTTP.start(addr, port, proxy.host, proxy.port) do |http|
40
- response, = http.get(path)
38
+ response = http.get(path)
41
39
  xml = response.body
42
40
  end
43
41
  else
44
42
  Bio::Command.start_http(addr, port) do |http|
45
- response, = http.get(path)
43
+ response = http.get(path)
46
44
  xml = response.body
47
45
  end
48
46
  end
@@ -2,11 +2,11 @@
2
2
  # = bio/db/fasta.rb - FASTA format class
3
3
  #
4
4
  # Copyright:: Copyright (C) 2001, 2002
5
- # GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>,
5
+ # Naohisa Goto <ng@bioruby.org>,
6
6
  # Toshiaki Katayama <k@bioruby.org>
7
7
  # License:: The Ruby License
8
8
  #
9
- # $Id: fasta.rb,v 1.28 2007/04/05 23:35:40 trevor Exp $
9
+ # $Id: fasta.rb,v 1.28.2.3 2008/06/20 13:43:36 ngoto Exp $
10
10
  #
11
11
  # == Description
12
12
  #
@@ -14,45 +14,7 @@
14
14
  #
15
15
  # == Examples
16
16
  #
17
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
18
- # rub.entry_id ==> 'gi|671595'
19
- # rub.get('emb') ==> 'CAA85678.1'
20
- # rub.emb ==> 'CAA85678.1'
21
- # rub.gi ==> '671595'
22
- # rub.accession ==> 'CAA85678'
23
- # rub.accessions ==> [ 'CAA85678' ]
24
- # rub.acc_version ==> 'CAA85678.1'
25
- # rub.locus ==> nil
26
- # rub.list_ids ==> [["gi", "671595"],
27
- # ["emb", "CAA85678.1", nil],
28
- # ["Perovskia abrotanoides"]]
29
- #
30
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
31
- # ckr.entry_id ==> "gi|2495000"
32
- # ckr.sp ==> "CCKR_CAVPO"
33
- # ckr.pir ==> "I51898"
34
- # ckr.gb ==> "AAB29504.1"
35
- # ckr.gi ==> "2495000"
36
- # ckr.accession ==> "AAB29504"
37
- # ckr.accessions ==> ["Q63931", "AAB29504"]
38
- # ckr.acc_version ==> "AAB29504.1"
39
- # ckr.locus ==> nil
40
- # ckr.description ==>
41
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
42
- # ckr.descriptions ==>
43
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
44
- # "cholecystokinin A receptor - guinea pig",
45
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
46
- # ckr.words ==>
47
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
48
- # "receptor", "type"]
49
- # ckr.id_strings ==>
50
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
51
- # "544724", "AAB29504.1", "Cavia"]
52
- # ckr.list_ids ==>
53
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
54
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
55
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
17
+ # See documents of Bio::FastaFormat class.
56
18
  #
57
19
  # == References
58
20
  #
@@ -65,6 +27,8 @@
65
27
 
66
28
  require 'bio/db'
67
29
  require 'bio/sequence'
30
+ require 'bio/sequence/dblink'
31
+ require 'bio/db/fasta/defline'
68
32
 
69
33
  module Bio
70
34
 
@@ -81,7 +45,7 @@ module Bio
81
45
  #
82
46
  # === Examples
83
47
  #
84
- # f_str = <<END
48
+ # f_str = <<END_OF_STRING
85
49
  # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
86
50
  # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
87
51
  # VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
@@ -101,7 +65,7 @@ module Bio
101
65
  # CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
102
66
  # FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
103
67
  # KTGDPLEWRRLFKKISTICRDIILIPN
104
- # END
68
+ # END_OF_STRING
105
69
  #
106
70
  # f = Bio::FastaFormat.new(f_str)
107
71
  # puts "### FastaFormat"
@@ -253,12 +217,10 @@ module Bio
253
217
  # might also be changed (but not always be changed)
254
218
  # because of efficiency.
255
219
  #
256
- def to_seq
257
- seq
258
- obj = Bio::Sequence.new(@seq)
259
- obj.definition = self.definition
260
- obj
220
+ def to_biosequence
221
+ Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat)
261
222
  end
223
+ alias to_seq to_biosequence
262
224
 
263
225
  # Parsing FASTA Defline, and extract IDs.
264
226
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
@@ -362,465 +324,6 @@ module Bio
362
324
 
363
325
  end #class FastaNumericFormat
364
326
 
365
-
366
- # Parsing FASTA Defline, and extract IDs and other informations.
367
- # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
368
- # or ":"-separated IDs.
369
- #
370
- # specs are described in:
371
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
372
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
373
- #
374
- # === Examples
375
- #
376
- # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
377
- # rub.entry_id ==> 'gi|671595'
378
- # rub.get('emb') ==> 'CAA85678.1'
379
- # rub.emb ==> 'CAA85678.1'
380
- # rub.gi ==> '671595'
381
- # rub.accession ==> 'CAA85678'
382
- # rub.accessions ==> [ 'CAA85678' ]
383
- # rub.acc_version ==> 'CAA85678.1'
384
- # rub.locus ==> nil
385
- # rub.list_ids ==> [["gi", "671595"],
386
- # ["emb", "CAA85678.1", nil],
387
- # ["Perovskia abrotanoides"]]
388
- #
389
- # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
390
- # ckr.entry_id ==> "gi|2495000"
391
- # ckr.sp ==> "CCKR_CAVPO"
392
- # ckr.pir ==> "I51898"
393
- # ckr.gb ==> "AAB29504.1"
394
- # ckr.gi ==> "2495000"
395
- # ckr.accession ==> "AAB29504"
396
- # ckr.accessions ==> ["Q63931", "AAB29504"]
397
- # ckr.acc_version ==> "AAB29504.1"
398
- # ckr.locus ==> nil
399
- # ckr.description ==>
400
- # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
401
- # ckr.descriptions ==>
402
- # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
403
- # "cholecystokinin A receptor - guinea pig",
404
- # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
405
- # ckr.words ==>
406
- # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
407
- # "receptor", "type"]
408
- # ckr.id_strings ==>
409
- # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
410
- # "544724", "AAB29504.1", "Cavia"]
411
- # ckr.list_ids ==>
412
- # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
413
- # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
414
- # ["gb", "AAB29504.1", nil], ["Cavia"]]
415
- #
416
- # === Refereneces
417
- #
418
- # * Fasta format description (NCBI)
419
- # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
420
- #
421
- # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
422
- # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
423
- #
424
- # * README.formatdb
425
- # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
426
- #
427
- class FastaDefline
428
-
429
- NSIDs = {
430
- # NCBI and WU-BLAST
431
- 'gi' => [ 'gi' ], # NCBI GI
432
- 'gb' => [ 'acc_version', 'locus' ], # GenBank
433
- 'emb' => [ 'acc_version', 'locus' ], # EMBL
434
- 'dbj' => [ 'acc_version', 'locus' ], # DDBJ
435
- 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT
436
- 'pdb' => [ 'entry_id', 'chain' ], # PDB
437
- 'bbs' => [ 'number' ], # GenInfo Backbone Id
438
- 'gnl' => [ 'database' , 'entry_id' ], # General database identifier
439
- 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence
440
- 'lcl' => [ 'entry_id' ], # Local Sequence identifier
441
-
442
- # WU-BLAST and NCBI
443
- 'pir' => [ 'accession', 'entry_id' ], # PIR
444
- 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation
445
- 'pat' => [ 'country', 'number', 'serial' ], # Patents
446
-
447
- # WU-BLAST only
448
- 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier
449
- 'gim' => [ 'number' ], # NCBI GenInfo Import identifier
450
- 'gp' => [ 'acc_version', 'locus' ], # GenPept
451
- 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
452
- 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ
453
- 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL
454
- 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank
455
-
456
- # Original
457
- 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
458
- }
459
-
460
- # Shows array that contains IDs (or ID-like strings).
461
- # Returns an array of arrays of strings.
462
- attr_reader :list_ids
463
-
464
- # Shows a possibly unique identifier.
465
- # Returns a string.
466
- attr_reader :entry_id
467
-
468
- # Parses given string.
469
- def initialize(str)
470
- @deflines = []
471
- @info = {}
472
- @list_ids = []
473
-
474
- @entry_id = nil
475
-
476
- lines = str.split("\x01")
477
- lines.each do |line|
478
- add_defline(line)
479
- end
480
- end #def initialize
481
-
482
- # Parses given string and adds parsed data.
483
- def add_defline(str)
484
- case str
485
- when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
486
- # NSIDs
487
- # examples:
488
- # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
489
- #
490
- # note: regexp (:?) means grouping without backreferences
491
- i = $1
492
- d = $2
493
- tks = i.split('|')
494
- tks << '' if i[-1,1] == '|'
495
- a = parse_NSIDs(tks)
496
- i = a[0].join('|')
497
- a.unshift('|')
498
- d = tks.join('|') + ' ' + d unless tks.empty?
499
- a << d
500
- this_line = a
501
- match_EC(d)
502
- parse_square_brackets(d).each do |x|
503
- if !match_EC(x, false) and x =~ /\A[A-Z]/ then
504
- di = [ x ]
505
- @list_ids << di
506
- @info['organism'] = x unless @info['organism']
507
- end
508
- end
509
-
510
- when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
511
- # examples:
512
- # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
513
- # >emb:CACDC28 [X80034] C.albicans CDC28 gene
514
- i = $1
515
- d = $2
516
- a = parse_ColonSepID(i)
517
- i = a.join(':')
518
- this_line = [ ':', a , d ]
519
- match_EC(d)
520
- parse_square_brackets(d).each do |x|
521
- if !match_EC(x, false) and x =~ /:/ then
522
- parse_ColonSepID(x)
523
- elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
524
- @list_ids << [ $1 ]
525
- end
526
- end
527
-
528
- when /^\>?\s*(\S+)(?:\s+(.+))?$/
529
- # examples:
530
- # >ABC12345 this is test
531
- i = $1
532
- d = $2.to_s
533
- @list_ids << [ i.chomp('.') ]
534
- this_line = [ '', [ i ], d ]
535
- match_EC(d)
536
- else
537
- i = str
538
- d = ''
539
- match_EC(i)
540
- this_line = [ '', [ i ], d ]
541
- end
542
-
543
- @deflines << this_line
544
- @entry_id = i unless @entry_id
545
- end
546
-
547
- def match_EC(str, write_flag = true)
548
- di = nil
549
- str.scan(/EC\:((:?[\-\d]+\.){3}(:?[\-\d]+))/i) do |x|
550
- di = [ 'EC', $1 ]
551
- if write_flag then
552
- @info['ec'] = di[1] if (!@info['ec'] or @info['ec'].to_s =~ /\-/)
553
- @list_ids << di
554
- end
555
- end
556
- di
557
- end
558
- private :match_EC
559
-
560
- def parse_square_brackets(str)
561
- r = []
562
- str.scan(/\[([^\]]*)\]/) do |x|
563
- r << x[0]
564
- end
565
- r
566
- end
567
- private :parse_square_brackets
568
-
569
- def parse_ColonSepID(str)
570
- di = str.split(':', 2)
571
- di << nil if di.size <= 1
572
- @list_ids << di
573
- di
574
- end
575
- private :parse_ColonSepID
576
-
577
- def parse_NSIDs(ary)
578
- # this method destroys ary
579
- data = []
580
- while token = ary.shift
581
- if labels = self.class::NSIDs[token] then
582
- di = [ token ]
583
- idtype = token
584
- labels.each do |x|
585
- token = ary.shift
586
- break unless token
587
- if self.class::NSIDs[token] then
588
- ary.unshift(token)
589
- break #each
590
- end
591
- if token.length > 0 then
592
- di << token
593
- else
594
- di << nil
595
- end
596
- end
597
- data << di
598
- else
599
- if token.length > 0 then
600
- # UCID (uncontrolled identifiers)
601
- di = [ token ]
602
- data << di
603
- @info['ucid'] = token unless @info['ucid']
604
- end
605
- break #while
606
- end
607
- end #while
608
- @list_ids.concat data
609
- data
610
- end #def parse_NSIDs
611
- private :parse_NSIDs
612
-
613
-
614
- # Shows original string.
615
- # Note that the result of this method may be different from
616
- # original string which is given in FastaDefline.new method.
617
- def to_s
618
- @deflines.collect { |a|
619
- s = a[0]
620
- (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
621
- }.join("\x01")
622
- end
623
-
624
- # Shows description.
625
- def description
626
- @deflines[0].to_a[-1]
627
- end
628
-
629
- # Returns descriptions.
630
- def descriptions
631
- @deflines.collect do |a|
632
- a[-1]
633
- end
634
- end
635
-
636
- # Shows ID-like strings.
637
- # Returns an array of strings.
638
- def id_strings
639
- r = []
640
- @list_ids.each do |a|
641
- if a.size >= 2 then
642
- r.concat a[1..-1].find_all { |x| x }
643
- else
644
- if a[0].to_s.size > 0 and a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
645
- r << a[0]
646
- end
647
- end
648
- end
649
- r.concat( words(true, []).find_all do |x|
650
- x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
651
- x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
652
- end)
653
- r
654
- end
655
-
656
- KillWords = [
657
- 'an', 'the', 'this', 'that',
658
- 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
659
- 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
660
- 'from', 'and', 'or', 'not',
661
- 'dna', 'rna', 'mrna', 'cdna', 'orf',
662
- 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
663
- 'similar', 'involved', 'identical', 'identity',
664
- 'cds', 'clone', 'library', 'contig', 'contigs',
665
- 'homolog', 'homologue', 'homologs', 'homologous',
666
- 'protein', 'proteins', 'gene', 'genes',
667
- 'product', 'products', 'sequence', 'sequences',
668
- 'strain', 'strains', 'region', 'regions',
669
- ]
670
- KillWordsHash = {}
671
- KillWords.each { |x| KillWordsHash[x] = true }
672
-
673
- KillRegexpArray = [
674
- /\A\d{1,3}\%?\z/,
675
- /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
676
- /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
677
- ]
678
-
679
- # Shows words used in the defline. Returns an Array.
680
- def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
681
- kwhash = self.class::KillWordsHash)
682
- a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
683
- a.collect! do |x|
684
- x.sub!(/\A[\$\*\-\+]+/, '')
685
- x.sub!(/[\$\*\-\=]+\z/, '')
686
- if x.size <= 1 then
687
- nil
688
- elsif kwhash[x.downcase] then
689
- nil
690
- else
691
- if kill_regexp.find { |expr| expr =~ x } then
692
- nil
693
- else
694
- x
695
- end
696
- end
697
- end
698
- a.compact!
699
- a.collect! { |x| x.downcase } unless case_sensitive
700
- a.sort!
701
- a.uniq!
702
- a
703
- end
704
-
705
- # Returns identifires by a database name.
706
- def get(dbname)
707
- db = dbname.to_s
708
- r = nil
709
- unless r = @info[db] then
710
- di = @list_ids.find { |x| x[0] == db.to_s }
711
- if di and di.size <= 2 then
712
- r = di[-1]
713
- elsif di then
714
- labels = self.class::NSIDs[db]
715
- [ 'acc_version', 'entry_id',
716
- 'locus', 'accession', 'number'].each do |x|
717
- if i = labels.index(x) then
718
- r = di[i+1]
719
- break if r
720
- end
721
- end
722
- r = di[1..-1].find { |x| x } unless r
723
- end
724
- @info[db] = r if r
725
- end
726
- r
727
- end
728
-
729
- # Returns an identifier by given type.
730
- def get_by_type(type_str)
731
- @list_ids.each do |x|
732
- if labels = self.class::NSIDs[x[0]] then
733
- if i = labels.index(type_str) then
734
- return x[i+1]
735
- end
736
- end
737
- end
738
- nil
739
- end
740
-
741
- # Returns identifiers by given type.
742
- def get_all_by_type(*type_strarg)
743
- d = []
744
- @list_ids.each do |x|
745
- if labels = self.class::NSIDs[x[0]] then
746
- type_strarg.each do |y|
747
- if i = labels.index(y) then
748
- d << x[i+1] if x[i+1]
749
- end
750
- end
751
- end
752
- end
753
- d
754
- end
755
-
756
- # Shows locus.
757
- # If the entry has more than two of such IDs,
758
- # only the first ID are shown.
759
- # Returns a string or nil.
760
- def locus
761
- unless defined?(@locus)
762
- @locus = get_by_type('locus')
763
- end
764
- @locus
765
- end
766
-
767
- # Shows GI.
768
- # If the entry has more than two of such IDs,
769
- # only the first ID are shown.
770
- # Returns a string or nil.
771
- def gi
772
- unless defined?(@gi) then
773
- @gi = get_by_type('gi')
774
- end
775
- @gi
776
- end
777
-
778
- # Shows accession with version number.
779
- # If the entry has more than two of such IDs,
780
- # only the first ID are shown.
781
- # Returns a string or nil.
782
- def acc_version
783
- unless defined?(@acc_version) then
784
- @acc_version = get_by_type('acc_version')
785
- end
786
- @acc_version
787
- end
788
-
789
- # Shows accession numbers.
790
- # Returns an array of strings.
791
- def accessions
792
- unless defined?(@accessions) then
793
- @accessions = get_all_by_type('accession', 'acc_version')
794
- @accessions.collect! { |x| x.sub(/\..*\z/, '') }
795
- end
796
- @accessions
797
- end
798
-
799
- # Shows an accession number.
800
- def accession
801
- unless defined?(@accession) then
802
- if acc_version then
803
- @accession = acc_version.split('.')[0]
804
- else
805
- @accession = accessions[0]
806
- end
807
- end
808
- @accession
809
- end
810
-
811
- def method_missing(name, *args)
812
- # raise ArgumentError,
813
- # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
814
- r = get(name, *args)
815
- if !r and !(self.class::NSIDs[name.to_s]) then
816
- raise "NameError: undefined method `#{name.inspect}'"
817
- end
818
- r
819
- end
820
-
821
-
822
- end #class FastaDefline
823
-
824
327
  end #module Bio
825
328
 
826
329
  if __FILE__ == $0