bio 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,117 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'bio'
3
+ s.version = "<% ###### Below is executed in ERB environment ######
4
+ # Version can be specified by the environment variable
5
+ env_ver = ENV['BIORUBY_GEM_VERSION']
6
+ env_ver = nil if env_ver.to_s.strip.empty?
7
+
8
+ # By default, determined from lib/bio/version.rb
9
+ load "./lib/bio/version.rb" unless defined?(BIO_VERSION_RB_LOADED)
10
+ case Bio::BIORUBY_EXTRA_VERSION
11
+ when nil
12
+ suffix = nil
13
+ when /\A\.(\d+)\z/
14
+ suffix = $1
15
+ when /\-alpha(\d+)/
16
+ decrement = true
17
+ suffix = 9000 + $1.to_i
18
+ when /\-pre(\d+)/
19
+ decrement = true
20
+ suffix = 9500 + $1.to_i
21
+ when /\-rc(\d+)/
22
+ decrement = true
23
+ suffix = 9900 + $1.to_i
24
+ else
25
+ suffix = "0000"
26
+ end
27
+ ver = Bio::BIORUBY_VERSION.reverse.collect do |i|
28
+ if decrement then
29
+ i -= 1
30
+ i < 0 ? (i += 10) : decrement = false
31
+ end
32
+ i
33
+ end.reverse
34
+ ver.push suffix if suffix
35
+ %><%=
36
+ (env_ver || ver.join('.'))
37
+ ###### Above is executed in ERB environment ######
38
+ %>"
39
+
40
+ s.author = "BioRuby project"
41
+ s.email = "staff@bioruby.org"
42
+ s.homepage = "http://bioruby.org/"
43
+ s.rubyforge_project = "bioruby"
44
+ s.summary = "Bioinformatics library"
45
+ s.description = "BioRuby is a library for bioinformatics (biology + information science)."
46
+
47
+ s.platform = Gem::Platform::RUBY
48
+ s.files = [
49
+ <% ###### Below is executed in ERB environment ######
50
+ # Gets file list from the "git ls-files" command.
51
+ files = (`git ls-files` rescue nil).to_s.split(/\r?\n/)
52
+ files.delete_if { |x| x.empty? }
53
+ # When git-ls-files isn't available, creates a list from current files.
54
+ if !($?.success?) or files.size <= 0 then
55
+ files =
56
+ [ "README.rdoc", "README_DEV.rdoc",
57
+ "ChangeLog", "KNOWN_ISSUES.rdoc",
58
+ "Rakefile", "bioruby.gemspec.erb",
59
+ "bioruby.gemspec", "setup.rb",
60
+ "extconf.rb", "rdoc.zsh"
61
+ ] + Dir.glob("{bin,doc,etc,lib,sample,test}/**/*").delete_if do |item|
62
+ case item
63
+ when /(\A|\/)CVS(\z|\/)/, /(\A|\/)rdoc(\z|\/)/, /\~\z/
64
+ true
65
+ else
66
+ false
67
+ end
68
+ end
69
+ end
70
+ %><%=
71
+ files.sort.collect { |x| x.dump }.join(",\n ")
72
+ ###### Above is executed in ERB environment ######
73
+ %>
74
+ ]
75
+
76
+ s.has_rdoc = true
77
+ s.extra_rdoc_files = [
78
+ <%= ###### Below is executed in ERB environment ######
79
+ # Files whose suffix are .rdoc are selected.
80
+ rdoc_files = files.find_all { |item| /\.rdoc\z/ =~ item }
81
+ # Fail safe settings
82
+ if rdoc_files.empty? then
83
+ rdoc_files = [ 'README.rdoc', 'README_DEV.rdoc',
84
+ 'doc/Changes-1.3.rdoc' ]
85
+ end
86
+ rdoc_files.push "ChangeLog" unless rdoc_files.include?("ChangeLog")
87
+ rdoc_files.sort.collect { |x| x.dump }.join(",\n ")
88
+ ###### Above is executed in ERB environment ######
89
+ %>
90
+ ]
91
+ s.rdoc_options << '--main' << 'README.rdoc'
92
+ s.rdoc_options << '--title' << 'BioRuby API documentation'
93
+ s.rdoc_options << '--exclude' << '\.yaml\z'
94
+ s.rdoc_options << '--line-numbers' << '--inline-source'
95
+
96
+ s.require_path = 'lib'
97
+ s.autorequire = 'bio'
98
+
99
+ s.bindir = "bin"
100
+ s.executables = [
101
+ <%= ###### Below is executed in ERB environment ######
102
+ # Files in bin/ directory are selected.
103
+ exec_files = files.find_all { |item| /\Abin\// =~ item }
104
+ # Non-executable files are removed from the list.
105
+ exec_files.delete_if { |item| !File.executable?(item) }
106
+ # strip "bin/"
107
+ exec_files.collect! { |item| item.sub(/\Abin\//, '') }
108
+ # Fail safe settings
109
+ if exec_files.empty? then
110
+ exec_files = [ "bioruby", "br_biofetch.rb", "br_biogetseq.rb", "br_bioflat.rb", "br_pmfetch.rb" ]
111
+ end
112
+ exec_files.sort.collect { |x| x.dump }.join(",\n ")
113
+ ###### Above is executed in ERB environment ######
114
+ %>
115
+ ]
116
+ s.default_executable = "bioruby"
117
+ end
@@ -338,6 +338,13 @@ In 1.1.0:
338
338
  instead of a string or nil: score, percent_identity, percent_positive,
339
339
  percent_gaps.
340
340
 
341
+ --- BioRuby Shell
342
+
343
+ In 1.1.0:
344
+
345
+ * Shell commands seq, ent, obj are renamed to getseq, getent, getobj,
346
+ respectively.
347
+
341
348
  === Deleted files
342
349
 
343
350
  : lib/bio/db/genbank.rb
@@ -0,0 +1,239 @@
1
+ = Incompatible and important changes since the BioRuby 1.2.1 release
2
+
3
+ A lot of changes have been made to BioRuby since version 1.2.1
4
+ was released.
5
+
6
+ == New features
7
+
8
+ === Support for sequence output with improvements of Bio::Sequence
9
+
10
+ The outputting of EMBL and GenBank formatted text is now supported in the
11
+ Bio::Sequence class. See the document of Bio::Sequence#output for details.
12
+ You can also create Bio::Sequence objects from many kinds of data such as
13
+ Bio::GenBank, Bio::EMBL, and Bio::FastaFormat by using the to_biosequence
14
+ method.
15
+
16
+ === BioSQL support
17
+
18
+ BioSQL support is completely rewritten by using ActiveRecord.
19
+
20
+ === Bio::Blast
21
+
22
+ Bio::Blast#reports can parse NCBI default (-m 0) format and tabular (-m 8)
23
+ format, in addition to XML (-m 7) format.
24
+
25
+ Bio::Blast::Report now supports XML format with multiple query sequences
26
+ generated by blastall 2.2.14 or later.
27
+
28
+ Bio::Blast.remote supports DDBJ, in addition to GenomeNet.
29
+ In addition, a list of available blast databases on remote sites
30
+ can be obtained by using Bio::Blast::Remote::DDBJ.databases and
31
+ Bio::Blast::Remote::GenomeNet.databases methods. Note that the above
32
+ remote blast methods may be changed in the future to support NCBI.
33
+
34
+ Bio::Blast::RPSBlast::Report is newly added, a parser for NCBI RPS Blast
35
+ (Reversed Position Specific Blast) default (-m 0 option) results.
36
+
37
+ === Bio::GFF::GFF2 and Bio::GFF::GFF3
38
+
39
+ The outputting of GFF2/GFF3-formatted text is now supported. However, many
40
+ incompatible changes have been made (See below for details).
41
+
42
+ === Bio::Hinv
43
+
44
+ H-Invitational Database web service (REST) client class is newly added.
45
+
46
+ === Bio::NCBI::REST
47
+
48
+ NCBI E-Utilities client class is newly added.
49
+
50
+ === Bio::PAML::Codeml and Bio::PAML::Codeml::Report
51
+
52
+ Bio::PAML::Codeml, wrapper for PAML codeml program, and
53
+ Bio::PAML::Codeml::Report, parser for codeml result are newly added,
54
+ though some of them are still under construction and too specific to
55
+ particular use cases.
56
+
57
+ === Bio::Locations
58
+
59
+ New method Bio::Locations#to_s is added to support output of features.
60
+
61
+ === Bio::TogoWS::REST
62
+
63
+ TogoWS REST client class is newly added. Information about TogoWS REST service
64
+ can be found on http://togows.dbcls.jp/site/en/rest.html.
65
+
66
+ == Deprecated classes
67
+
68
+ === Bio::Features
69
+
70
+ Bio::Features is obsolete and changed to an array of Bio::Feature objects
71
+ with some backward compatibility methods. The backward compatibility methods
72
+ will be removed in the future.
73
+
74
+ === Bio::References
75
+
76
+ Bio::References is obsolete and changed to an array of Bio::Reference objects
77
+ with some backward compatibility methods. The backward compatibility methods
78
+ will be removed in the future.
79
+
80
+ == Incompatible changes
81
+
82
+ === Bio::BIORUBY_VERSION
83
+
84
+ Definition of the constant Bio::BIORUBY_VERSION is moved from lib/bio.rb to
85
+ lib/bio/version.rb. Normally, the autoload mechanism of Ruby correctly loads
86
+ the version.rb, but special scripts directly using bio.rb may need to
87
+ be changed.
88
+
89
+ Bio::BIORUBY_VERSION is changed to be frozen.
90
+
91
+ New constants Bio::BIORUBY_EXTRA_VERSION and Bio::BIORUBY_VERSION_ID are
92
+ added. See their RDoc for details.
93
+
94
+ === Bio::Sequence
95
+
96
+ Bio::Sequence#date is removed. Alternatively, date_created or date_modified
97
+ can be used.
98
+
99
+ Bio::Sequence#taxonomy is changed to be an alias of classification, and
100
+ the data type is changed to an array of strings.
101
+
102
+ === Bio::Locations and Bio::Location
103
+
104
+ A carat in a location (e.g. "123^124") is now parsed, instead of being
105
+ replaced by "..". To distinguish from normal "..", a new attribute
106
+ Bio::Location#carat is used.
107
+
108
+ "order(...)" or "group(...)" are also parsed, instead of being regarded
109
+ as "join(...)". To distinguish from "join(...)", a new attribute
110
+ Bio::Locations#operator is used. When "order(...)" or "group(...)",
111
+ the attribute is set to :order or :group, respectively. Note that
112
+ "group(...)" is already deprecated in EMBL/GenBank/DDBJ.
113
+
114
+ === Bio::Blast
115
+
116
+ Return value of Bio::Blast#exec_* is changed to String instead of Report
117
+ object. Parsing the string is now processed in Bio::Blast#query method.
118
+
119
+ Bio::Blast#exec_genomenet_tab and Bio::Blast#server="genomenet_tab" are
120
+ deprecated.
121
+
122
+ Bio::Blast#options=() can now change the following attributes: program, db,
123
+ format, matrix, and filter.
124
+
125
+ Bio::Blast.reports now supports default (-m 0) and tabular (-m 8) formats.
126
+ Old implementation (only supports XML) is renamed to Bio::Blast.reports_xml,
127
+ to keep compatibility for older BLAST XML documents which might not be parsed
128
+ by the new Bio::Blast.reports nor Bio::FlatFile, although we are not sure
129
+ whether such documents really exist or not.
130
+
131
+ === Bio::Blast::Default::Report and Bio::Blast::WU::Report
132
+
133
+ Iteration#lambda, #kappa, #entropy, #gapped_lambda, #gapped_kappa,
134
+ and #gapped_entropy, and the same methods in the Report class are
135
+ changed to return float or nil instead of string or nil.
136
+
137
+ === Bio::Blat
138
+
139
+ When reading BLAT psl (or pslx) data by using Bio::FlatFile, it checks
140
+ each query name and returns a new entry object when the query name is
141
+ changed from previous queries. That is, data is stored in two or more
142
+ Bio::Blat::Report objects, instead of previous version's behavior
143
+ (always reads all data at once and stores to a Bio::Blat::Report object).
144
+
145
+ === Bio::GFF, Bio::GFF::GFF2 and Bio::GFF::GFF3
146
+
147
+ Bio::GFF::Record#comments is renamed to #comment, and #comments= is
148
+ renamed to #comment=, because they only allow a single String (or nil)
149
+ and the plural form "comments" may be confusing. The "comments" and
150
+ "comments=" methods can still be used, but warning messages will be
151
+ shown when they are used in GFF2::Record and GFF3::Record objects.
152
+
153
+ See below about GFF2 and/or GFF3 specific changes.
154
+
155
+ === Bio::GFF::GFF2 and Bio::GFF::GFF3
156
+
157
+ Bio::GFF::GFF2::Record.new and Bio::GFF::GFF3::Record.new can also
158
+ get 9 arguments corresponding to GFF columns, which helps to create
159
+ Record object directly without formatted text.
160
+
161
+ Bio::GFF::GFF2::Record#start, #end, and #frame return Integer or nil,
162
+ and #score returns Float or nil, instead of String or nil.
163
+ The same changes are also made to Bio::GFF::GFF3::Record.
164
+
165
+ Bio::GFF::GFF2::Record#attributes and Bio::GFF::GFF3::Record#attributes
166
+ are changed to return a nested Array, containing [ tag, value ] pairs,
167
+ because of supporting multiple tags in the same tag names. If you want
168
+ to get a Hash, use Record#attributes_to_hash method, though some
169
+ tag-value pairs in the same tag names may be lost. Note that
170
+ Bio::GFF::Record#attribute still returns a Hash for compatibility.
171
+
172
+ New methods for getting, setting and manipulating attributes are added
173
+ to Bio::GFF::GFF2::Record and Bio::GFF::GFF3::Record classes:
174
+ attribute, get_attribute, get_attributes, set_attribute, replace_attributes,
175
+ add_attribute, delete_attribute, delete_attributes, sort_attributes_by_tag!.
176
+ It is recommended to use these methods instead of directly manipulating
177
+ the array returned by Record#attributes.
178
+
179
+ Bio::GFF::GFF2#to_s, Bio::GFF::GFF3#to_s, Bio::GFF::GFF2::Record#to_s,
180
+ and Bio::GFF::GFF3::Record#to_s are added to support output of
181
+ GFF2/GFF3 data.
182
+
183
+ === Bio::GFF::GFF2
184
+
185
+ GFF2 attribute values are now automatically unescaped. In addition,
186
+ if a value of an attribute consists of two or more tokens delimited
187
+ by spaces, an object of the new class Bio::GFF::GFF2::Record::Value is
188
+ returned instead of String. The new class Bio::GFF::GFF2::Record::Value
189
+ aims to store a parsed value of an attribute. If you really want to get
190
+ unparsed string, Bio::GFF::GFF2::Record::Value#to_s can be used.
191
+
192
+ The metadata (lines beginning with "##") are parsed to
193
+ Bio::GFF::GFF2::MetaData objects and are stored to Bio::GFF::GFF2#metadata
194
+ as an array, except the "##gff-version" line. The "##gff-version" version
195
+ string is stored to the Bio::GFF::GFF2#gff_version as a string.
196
+
197
+ === Bio::GFF::GFF3
198
+
199
+ Aliases of columns which are renamed in the GFF3 specification are added
200
+ to the Bio::GFF::GFF3::Record class: seqid (column 1; alias of "seqname"),
201
+ feature_type (column 3; alias of "feature"; in the GFF3 spec, it is
202
+ called "type", but because "type" is already used by Ruby, we use
203
+ "feature_type"), phase (column 8; formerly "frame"). Original names can
204
+ still be used because they are only aliases.
205
+
206
+ Sequences bundled within GFF3 after "##FASTA" are now supported
207
+ (Bio::GFF::GFF3#sequences).
208
+
209
+ GFF3 attribute keys and values are automatically unescaped. Each attribute
210
+ value is stored as a string, except for special attributes listed below:
211
+ * Bio::GFF::GFF3::Record::Target to store a "Target" attribute.
212
+ * Bio::GFF::GFF3::Record::Gap to store a "Gap" attribute.
213
+
214
+ The metadata (lines beginning with "##") are parsed to
215
+ Bio::GFF::GFF3::MetaData objects and stored to Bio::GFF::GFF3#metadata
216
+ as an array, except "##gff-version", "##sequence-region", "###",
217
+ and "##FASTA" lines.
218
+ * "##gff-version" version string is stored to Bio::GFF::GFF3#gff_version.
219
+ * "##sequence-region" lines are parsed to Bio::GFF::GFF3::SequenceRegion
220
+ objects and stored to Bio::GFF::GFF3#sequence_regions as an array.
221
+ * "###" lines are parsed to Bio::GFF::GFF3::RecordBoundary objects.
222
+ * "##FASTA" is regarded as the beginning of bundled sequences.
223
+
224
+ === Bio::Pathway
225
+
226
+ Bio::Pathway#cliquishness is changed to calculate cliquishness (clustering
227
+ coefficient) for not only undirected graphs but also directed graphs.
228
+
229
+ In Bio::Pathway#to_matrix, dump_matrix, dump_list, and depth_first_search
230
+ methods, to avoid dependency on the order of objects in Hash#each (and
231
+ each_key etc.), Bio::Pathway#index is used to specify preferences of
232
+ nodes in a graph.
233
+
234
+ === Bio::SQL and BioSQL related classes
235
+
236
+ BioSQL support is completely rewritten by using ActiveRecord. See documents
237
+ in lib/bio/io/sql.rb, lib/bio/io/biosql, and lib/bio/db/biosql for details
238
+ of changes and usage of the classes/modules.
239
+
@@ -1,49 +1,79 @@
1
+ # This document is generated with a version of rd2html (part of Hiki)
2
+ #
3
+ # A possible test run could be from rdtool (on Debian package rdtool)
4
+ #
5
+ # ruby -I lib ./bin/rd2 ~/cvs/opensource/bioruby/doc/Tutorial.rd
6
+ #
7
+ # or with style sheet:
8
+ #
9
+ # ruby -I lib ./bin/rd2 -r rd/rd2html-lib.rb --with-c
10
+ ss=bioruby.css ~/cvs/opensource/bioruby/doc/Tutorial.rd > ~/bioruby.html
11
+ #
12
+ # in Debian:
13
+ #
14
+ # rd2 -r rd/rd2html-lib --with-css="/home/wrk/izip/cvs/opensource/bioruby/lib/bio/shell/rails/vendor/plugins/bioruby/generators/bioruby/templates/bioruby.css" Tutorial.rd > index.html
15
+ #
16
+ # A common problem is tabs in the text file! TABs are not allowed.
17
+ #
18
+ # To add tests run Toshiaki's bioruby shell and paste in the query plus
19
+ # results.
20
+ #
21
+ # To run the embedded Ruby doctests you can get the doctest.rb from Pjotr.
22
+
1
23
  =begin
24
+ #doctest Testing bioruby
2
25
 
3
- See the document in the CVS repository ./doc/((<Tutorial.rd|URL:http://cvs.open-bio.org/cgi-bin/viewcvs/viewcvs.cgi/*checkout*/bioruby/doc/Tutorial.rd?rev=HEAD&cvsroot=bioruby&content-type=text/plain>)) - for a potentially more up-to-date edition. This one was updated:
26
+ = BioRuby Tutorial
4
27
 
5
- $Id: Tutorial.rd,v 1.13 2007/07/09 12:28:07 pjotr Exp $
28
+ Editor: PjotrPrins <p .at. bioruby.org>
6
29
 
7
- Translated into English: Naohisa Goto <ng@bioruby.org>
30
+ * Copyright (C) 2001-2003 KATAYAMA Toshiaki <k .at. bioruby.org>
31
+ * Copyright (C) 2005-2008 Pjotr Prins, Naohisa Goto and others
8
32
 
9
- Editor: PjotrPrins <p@bioruby.org>
33
+ The latest version resides in the CVS repository ./doc/((<Tutorial.rd|URL:http://cvs.open-bio.org/cgi-bin/viewcvs/viewcvs.cgi/*checkout*/bioruby/doc/Tutorial.rd?rev=HEAD&cvsroot=bioruby&content-type=text/plain>)). This one was updated:
10
34
 
11
- Copyright (C) 2001-2003 KATAYAMA Toshiaki <k@bioruby.org>, 2005-2007 Pjotr Prins, Naohisa Goto and others
35
+ $Id: Tutorial.rd,v 1.22 2008/05/19 12:22:05 pjotr Exp $
12
36
 
13
- IMPORTANT NOTICE: This page is maintained in the BioRuby CVS
14
- repository. Please edit the file there otherwise changes may get
15
- lost. See ((<BioRuby Developer Information>)) for CVS and mailing list
16
- access.
17
-
18
- = BioRuby Tutorial
37
+ in preparation for the ((<BioHackathon 2008|URL:http://hackathon.dbcls.jp/>))
19
38
 
20
39
  == Introduction
21
40
 
22
- This is a tutorial for using Bioruby. For BioRuby you need to install
23
- Ruby and the BioRuby package on your computer. For each following the
24
- instruction on the respective websites. (EDITOR's NOTE: include URL's)
25
-
26
- (EDITOR's NOTE: describe rdoc use for individual classes)
41
+ This is a tutorial for using Bioruby. A basic knowledge of Ruby is required.
42
+ If you want to know more about the programming language Ruby we recommend the
43
+ excellent book ((<Programming Ruby|URL:http://www.pragprog.com/titles/ruby>))
44
+ by Dave Thomas and Andy Hunt - some of it is online
45
+ ((<here|URL:http://www.rubycentral.com/pickaxe/>)).
27
46
 
28
- For further information on the Ruby language see the section 'Further
29
- reading' at the end.
47
+ For BioRuby you need to install Ruby and the BioRuby package on your computer.
30
48
 
31
49
  You can check whether Ruby is installed on your computer and what
32
50
  version it has with the
33
51
 
34
- % ruby -v
52
+ % ruby -v
35
53
 
36
54
  command. Showing something like:
37
55
 
38
56
  ruby 1.8.5 (2006-08-25) [powerpc-linux]
39
57
 
58
+ If you see no such thing you'll have to install Ruby using your installation
59
+ manager. For more information see the
60
+ ((<Ruby|URL:http://www.ruby-lang.org/en/>)) website.
61
+
62
+ Once Ruby works, download and install Bioruby using the links on the
63
+ ((<Bioruby|URL:http://bioruby.org/>)) website.
64
+
65
+ A lot of BioRuby's documentation exists in the source code and unit tests. To
66
+ really dive in you will need the latest source code tree. The embedded rdoc
67
+ documentation can be viewed online at
68
+ ((<bioruby's rdoc|URL:http://bioruby.org/rdoc/>)). But first, let's start!
40
69
 
41
70
  == Trying Bioruby
42
71
 
43
72
  Bioruby comes with its own shell. After unpacking the sources run the
44
73
  following command
45
74
 
46
- $BIORUBY/bin/bioruby
75
+ ./bin/bioruby or
76
+ ruby -I lib bin/bioruby
47
77
 
48
78
  and you should see a prompt
49
79
 
@@ -52,10 +82,14 @@ and you should see a prompt
52
82
  Now test the following:
53
83
 
54
84
  bioruby> seq = Bio::Sequence::NA.new("atgcatgcaaaa")
55
- bioruby> puts seq
56
- atgcatgcaaaa
57
- bioruby> puts seq.complement
58
- ttttgcatgcat
85
+ ==> "atgcatgcaaaa"
86
+
87
+ bioruby> seq.complement
88
+ ==> "ttttgcatgcat"
89
+
90
+ See the Bioruby shell section below for more tweaking. If you have trouble running
91
+ examples also check the section below on trouble shooting. You can also post a
92
+ question to the mailing list. BioRuby developers usually try to help.
59
93
 
60
94
  == Working with nucleic / amino acid sequences (Bio::Sequence class)
61
95
 
@@ -68,33 +102,48 @@ calculated, and so on. When translating into amino acid sequences the
68
102
  frame can be specified and optionally the condon table selected (as
69
103
  defined in codontable.rb).
70
104
 
105
+ bioruby> seq = Bio::Sequence::NA.new("atgcatgcaaaa")
106
+ ==> "atgcatgcaaaa"
107
+
108
+ # complemental sequence (Bio::Sequence::NA object)
109
+ bioruby> seq.complement
110
+ ==> "ttttgcatgcat"
111
+
112
+ bioruby> seq.subseq(3,8) # gets subsequence of positions 3 to 8
113
+ ==> "gcatgc"
114
+ bioruby> seq.gc_percent
115
+ ==> 33
116
+ bioruby> seq.composition
117
+ ==> {"a"=>6, "c"=>2, "g"=>2, "t"=>2}
118
+ bioruby> seq.translate
119
+ ==> "MHAK"
120
+ bioruby> seq.translate(2) # translate from frame 2
121
+ ==> "CMQ"
122
+ bioruby> seq.translate(1,11) # codon table 11
123
+ ==> "MHAK"
124
+ bioruby> seq.translate.codes
125
+ ==> ["Met", "His", "Ala", "Lys"]
126
+ bioruby> seq.translate.names
127
+ ==> ["methionine", "histidine", "alanine", "lysine"]
128
+ bioruby> seq.translate.composition
129
+ ==> {"K"=>1, "A"=>1, "M"=>1, "H"=>1}
130
+ bioruby> seq.translate.molecular_weight
131
+ ==> 485.605
132
+ bioruby> seq.complement.translate
133
+ ==> "FCMH"
134
+
135
+ Get a random sequence with the same NA count:
136
+
137
+ bioruby> counts = {'a'=>seq.count('a'),'c'=>seq.count('c'),'g'=>seq.count('g'),'t'=>seq.count('t')}
138
+ ==> {"a"=>6, "c"=>2, "g"=>2, "t"=>2}
139
+ bioruby!> randomseq = Bio::Sequence::NA.randomize(counts)
140
+ ==!> "aaacatgaagtc"
141
+
142
+ bioruby!> print counts
143
+ a6c2g2t2
144
+ bioruby!> p counts
145
+ {"a"=>6, "c"=>2, "g"=>2, "t"=>2}
71
146
 
72
- #!/usr/bin/env ruby
73
-
74
- require 'bio'
75
-
76
- seq = Bio::Sequence::NA.new("atgcatgcaaaa")
77
-
78
- puts seq # original sequence
79
- puts seq.complement # complemental sequence (Bio::Sequence::NA object)
80
- puts seq.subseq(3,8) # gets subsequence of positions 3 to 8
81
-
82
- p seq.gc_percent # GC percent (BioRuby 0.6.X: Float, BioRuby 0.7 or later: Integer)
83
- p seq.composition # nucleic acid compositions (Hash)
84
-
85
- puts seq.translate # translation (Bio::Sequence::AA object)
86
- puts seq.translate(2) # translation from frame 2 (default is frame 1)
87
- puts seq.translate(1,11) # using codon table No.11 (see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
88
-
89
- p seq.translate.codes # shows three-letter codes (Array)
90
- p seq.translate.names # shows amino acid names (Array)
91
- p seq.translate.composition # amino acid compositions (Hash)
92
- p seq.translate.molecular_weight # calculating molecular weight (Float)
93
-
94
- puts seq.complement.translate # translation of complemental strand
95
-
96
- counts = {'a'=>seq.count('a'),'c'=>seq.count('c'),'g'=>seq.count('g'),'t'=>seq.count('t')}
97
- p randomseq = Bio::Sequence::NA.randomize(counts) # reshuffle sequence with same freq.
98
147
 
99
148
  The p, print and puts methods are standard Ruby ways of outputting to
100
149
  the screen. If you want to know more about standard Ruby commands you
@@ -105,9 +154,9 @@ Windows). For example
105
154
  % ri p
106
155
  % ri File.open
107
156
 
108
- Nucleic acid sequence is an object of +Bio::Sequence::NA+ class, and
109
- amino acid sequence is an object of +Bio::Sequence::AA+ class. Shared
110
- methods are in the parent +Bio::Sequence+ class.
157
+ Nucleic acid sequence is an object of Bio::Sequence::NA class, and
158
+ amino acid sequence is an object of Bio::Sequence::AA class. Shared
159
+ methods are in the parent Bio::Sequence class.
111
160
 
112
161
  As Bio::Sequence class inherits Ruby's String class, you can use
113
162
  String class methods. For example, to get a subsequence, you can
@@ -116,15 +165,12 @@ not only use subseq(from, to) but also String#[].
116
165
  Please take note that Ruby's strings are base 0 - i.e. the first letter
117
166
  has index 0, for example:
118
167
 
119
- s = 'abc'
120
- puts s[0].chr
121
-
122
- >a
123
-
124
- puts s[0..1]
125
-
126
- >ab
127
-
168
+ bioruby> s = 'abc'
169
+ ==> "abc"
170
+ bioruby> s[0].chr
171
+ ==> "a"
172
+ bioruby> s[0..1]
173
+ ==> "ab"
128
174
 
129
175
  So when using String methods, you should subtract 1 from positions
130
176
  conventionally used in biology. (subseq method will throw an exception if you
@@ -136,55 +182,71 @@ way of writing concise and clear code using 'closures'. Each sliding
136
182
  window creates a subsequence which is supplied to the enclosed block
137
183
  through a variable named +s+.
138
184
 
139
- * Shows average percentage of GC content for 100 bases (stepping
140
- the default one base at a time)
185
+ Show average percentage of GC content for 20 bases (stepping the default one base at a time)
141
186
 
142
- seq.window_search(100) do |s|
143
- puts s.gc_percent
144
- end
187
+ bioruby> seq = Bio::Sequence::NA.new("atgcatgcaattaagctaatcccaattagatcatcccgatcatcaaaaaaaaaa")
188
+ ==> "atgcatgcaattaagctaatcccaattagatcatcccgatcatcaaaaaaaaaa"
145
189
 
190
+ bioruby> a=[]; seq.window_search(20) { |s| a.push s.gc_percent }
191
+ bioruby> a
192
+ ==> [30, 35, 40, 40, 35, 35, 35, 30, 25, 30, 30, 30, 35, 35, 35, 35, 35, 40, 45, 45, 45, 45, 40, 35, 40, 40, 40, 40, 40, 35, 35, 35, 30, 30, 30]
193
+
194
+
146
195
  Since the class of each subsequence is the same as original sequence
147
196
  (Bio::Sequence::NA or Bio::Sequence::AA or Bio::Sequence), you can
148
197
  use all methods on the subsequence. For example,
149
198
 
150
- * Shows translation results for 15 bases shifting a codon at a time
199
+ Shows translation results for 15 bases shifting a codon at a time
200
+
201
+ bioruby> a = []
202
+ bioruby> seq.window_search(15, 3) do |s|
203
+ bioruby> a.push s.translate
204
+ bioruby> end
205
+ bioruby> a
206
+ ==> ["MHAIK", "HAIKL", "AIKLI", "IKLIP", "KLIPI", "LIPIR", "IPIRS", "PIRSS", "IRSSR", "RSSRS", "SSRSS", "SRSSK", "RSSKK", "SSKKK"]
151
207
 
152
- seq.window_search(15, 3) do |s|
153
- puts s.translate
154
- end
155
208
 
156
209
  Finally, the window_search method returns the last leftover
157
210
  subsequence. This allows for example
158
211
 
159
- * Divide a genome sequence into sections of 10000bp and
160
- output FASTA formatted sequences. The 1000bp at the start and end of
161
- each subsequence overlapped. At the 3' end of the sequence the
162
- leftover subsequence shorter than 10000bp is also added
212
+ Divide a genome sequence into sections of 10000bp and
213
+ output FASTA formatted sequences (line width 60 chars). The 1000bp at the
214
+ start and end of each subsequence overlapped. At the 3' end of the sequence
215
+ the leftover is also added:
163
216
 
164
217
  i = 1
218
+ textwidth=60
165
219
  remainder = seq.window_search(10000, 9000) do |s|
166
- puts s.to_fasta("segment #{i}", 60)
220
+ puts s.to_fasta("segment #{i}", textwidth)
167
221
  i += 1
168
222
  end
169
- puts remainder.to_fasta("segment #{i}", 60)
223
+ if remainder
224
+ puts remainder.to_fasta("segment #{i}", textwidth)
225
+ end
170
226
 
171
227
  If you don't want the overlapping window, set window size and stepping
172
228
  size to equal values.
173
229
 
174
230
  Other examples
175
231
 
176
- * Count the codon usage
232
+ Count the codon usage
177
233
 
178
- codon_usage = Hash.new(0)
179
- seq.window_search(3, 3) do |s|
180
- codon_usage[s] += 1
181
- end
234
+ bioruby> codon_usage = Hash.new(0)
235
+ bioruby> seq.window_search(3, 3) do |s|
236
+ bioruby> codon_usage[s] += 1
237
+ bioruby> end
238
+ bioruby> codon_usage
239
+ ==> {"cat"=>1, "aaa"=>3, "cca"=>1, "att"=>2, "aga"=>1, "atc"=>1, "cta"=>1, "gca"=>1, "cga"=>1, "tca"=>3, "aag"=>1, "tcc"=>1, "atg"=>1}
182
240
 
183
- * Calculate molecular weight for each 10-aa peptide (or 10-nt nucleic acid)
184
241
 
185
- seq.window_search(10, 10) do |s|
186
- puts s.molecular_weight
187
- end
242
+ Calculate molecular weight for each 10-aa peptide (or 10-nt nucleic acid)
243
+
244
+ bioruby> a = []
245
+ bioruby> seq.window_search(10, 10) do |s|
246
+ bioruby> a.push s.molecular_weight
247
+ bioruby> end
248
+ bioruby> a
249
+ ==> [3096.2062, 3086.1962, 3056.1762, 3023.1262, 3073.2262]
188
250
 
189
251
  In most cases, sequences are read from files or retrieved from databases.
190
252
  For example:
@@ -210,6 +272,10 @@ For example, translates my_naseq.txt:
210
272
 
211
273
  % ruby na2aa.rb my_naseq.txt
212
274
 
275
+ or use a pipe!
276
+
277
+ % cat my_naseq.txt|ruby na2aa.rb
278
+
213
279
  Outputs
214
280
 
215
281
  VAIFPKAMTGAKNQSSDICLMPHVGLIRRGQRRIRHLVQMSDAA*
@@ -218,8 +284,9 @@ You can also write this, a bit fanciful, as a one-liner script.
218
284
 
219
285
  % ruby -r bio -e 'p Bio::Sequence::NA.new($<.read).translate' my_naseq.txt
220
286
 
221
- In the next section we will retrieve data from databases instead of
222
- using raw sequence files.
287
+ In the next section we will retrieve data from databases instead of using raw
288
+ sequence files. One generic example of the above can be found in
289
+ ./sample/na2aa.rb.
223
290
 
224
291
  == Parsing GenBank data (Bio::GenBank class)
225
292
 
@@ -243,7 +310,8 @@ the data:
243
310
 
244
311
  print ">#{gb.accession} " # Accession
245
312
  puts gb.definition # Definition
246
- puts gb.naseq # Nucleic acid sequence (Bio::Sequence::NA object)
313
+ puts gb.naseq # Nucleic acid sequence
314
+ # (Bio::Sequence::NA object)
247
315
  end
248
316
 
249
317
  But that has the disadvantage the code is tied to GenBank input. A more
@@ -251,9 +319,9 @@ generic method is to use Bio::FlatFile which allows you to use different
251
319
  input formats:
252
320
 
253
321
  #!/usr/bin/env ruby
254
-
322
+
255
323
  require 'bio'
256
-
324
+
257
325
  ff = Bio::FlatFile.new(Bio::GenBank, ARGF)
258
326
  ff.each_entry do |gb|
259
327
  definition = "#{gb.accession} #{gb.definition}"
@@ -288,9 +356,6 @@ Again another option is to use the Bio::DB.open class:
288
356
  puts gb.naseq.to_fasta(definition, 60)
289
357
  end
290
358
 
291
- (TRANSLATOR'S NOTE: Bio::DB.open have not been used so well.)
292
- (EDITOR's NOTE: Test code)
293
-
294
359
  Next, we are going to parse the GenBank 'features', which is normally
295
360
  very complicated:
296
361
 
@@ -333,12 +398,12 @@ very complicated:
333
398
  end
334
399
  end
335
400
 
336
- * Note: In this example Feature#assoc method makes a Hash from a
337
- feature object. It is useful because you can get data from the hash
338
- by using qualifiers as keys.
339
- (But there is a risk some information is lost when two or more
340
- qualifiers are the same. Therefore an Array is returned by
341
- Feature#feature)
401
+ Note: In this example Feature#assoc method makes a Hash from a
402
+ feature object. It is useful because you can get data from the hash
403
+ by using qualifiers as keys.
404
+ (But there is a risk some information is lost when two or more
405
+ qualifiers are the same. Therefore an Array is returned by
406
+ Feature#feature)
342
407
 
343
408
  Bio::Sequence#splicing splices subsequence from nucleic acid sequence
344
409
  according to location information used in GenBank, EMBL and DDBJ.
@@ -352,11 +417,11 @@ feature style location text but also Bio::Locations object. For more
352
417
  information about location format and Bio::Locations class, see
353
418
  bio/location.rb.
354
419
 
355
- * Splice according to location string used in a GenBank entry
420
+ Splice according to location string used in a GenBank entry
356
421
 
357
422
  naseq.splicing('join(2035..2050,complement(1775..1818),13..345)')
358
423
 
359
- * Generate Bio::Locations object and pass the splicing method
424
+ Generate Bio::Locations object and pass the splicing method
360
425
 
361
426
  locs = Bio::Locations.new('join((8298.8300)..10206,1..855)')
362
427
  naseq.splicing(locs)
@@ -364,17 +429,16 @@ bio/location.rb.
364
429
  You can also use the splicing method for amino acid sequences
365
430
  (Bio::Sequence::AA objects).
366
431
 
367
- * Splicing peptide from a protein (e.g. signal peptide)
432
+ Splicing peptide from a protein (e.g. signal peptide)
368
433
 
369
434
  aaseq.splicing('21..119')
370
435
 
371
- (EDITOR's NOTE: why use STRINGs here?)
372
436
 
373
437
  === More databases
374
438
 
375
439
  Databases in BioRuby are essentially accessed like that of GenBank
376
- with classes like Bio::GenBank, Bio::KEGG::GENES,
377
- (EDITOR's NOTE: include complete list)
440
+ with classes like Bio::GenBank, Bio::KEGG::GENES. A full list can be found in
441
+ the ./lib/bio/db directory of the BioRuby source tree.
378
442
 
379
443
  In many cases the Bio::DatabaseClass acts as a factory pattern
380
444
  and recognises the database type automatically - returning a
@@ -401,7 +465,14 @@ database class?
401
465
  end
402
466
 
403
467
  An example that can take any input, filter using a regular expression to output
404
- to a FASTA file can be found in sample/any2fasta.rb.
468
+ to a FASTA file can be found in sample/any2fasta.rb. With this technique it is
469
+ possible to write a Unix type grep/sort pipe for sequence information. One
470
+ example using scripts in the BIORUBY sample folder:
471
+
472
+ fastagrep.rb '/At|Dm/' database.seq | fastasort.rb
473
+
474
+ greps the database for Arabidopsis and Drosophila entries and sorts the output
475
+ to FASTA.
405
476
 
406
477
  Other methods to extract specific data from database objects can be
407
478
  different between databases, though some methods are common (see the
@@ -427,35 +498,30 @@ multiple Bio::Reference objects as an Array. And some classes have a
427
498
  Bio::Alignment class in bio/alignment.rb is a container class like Ruby's Hash,
428
499
  Array and BioPerl's Bio::SimpleAlign. A very simple example is:
429
500
 
430
- require 'bio'
431
-
432
- seqs = [ 'atgca', 'aagca', 'acgca', 'acgcg' ]
433
- seqs = seqs.collect{ |x| Bio::Sequence::NA.new(x) }
434
-
501
+ bioruby> seqs = [ 'atgca', 'aagca', 'acgca', 'acgcg' ]
502
+ bioruby> seqs = seqs.collect{ |x| Bio::Sequence::NA.new(x) }
435
503
  # creates alignment object
436
- a = Bio::Alignment.new(seqs)
437
-
438
- # shows consensus sequence
439
- p a.consensus # ==> "a?gc?"
440
-
504
+ bioruby> a = Bio::Alignment.new(seqs)
505
+ bioruby> a.consensus
506
+ ==> "a?gc?"
441
507
  # shows IUPAC consensus
442
- p a.consensus_iupac # ==> "ahgcr"
443
-
508
+ a.consensus_iupac
509
+ ==> "ahgcr"
444
510
  # iterates over each seq
445
511
  a.each { |x| p x }
446
- # ==>
447
- # "atgca"
448
- # "aagca"
449
- # "acgca"
450
- # "acgcg"
512
+ # ==>
513
+ # "atgca"
514
+ # "aagca"
515
+ # "acgca"
516
+ # "acgcg"
451
517
  # iterates over each site
452
518
  a.each_site { |x| p x }
453
- # ==>
454
- # ["a", "a", "a", "a"]
455
- # ["t", "a", "c", "c"]
456
- # ["g", "g", "g", "g"]
457
- # ["c", "c", "c", "c"]
458
- # ["a", "a", "a", "g"]
519
+ # ==>
520
+ # ["a", "a", "a", "a"]
521
+ # ["t", "a", "c", "c"]
522
+ # ["g", "g", "g", "g"]
523
+ # ["c", "c", "c", "c"]
524
+ # ["a", "a", "a", "g"]
459
525
 
460
526
  # doing alignment by using CLUSTAL W.
461
527
  # clustalw command must be installed.
@@ -469,21 +535,22 @@ library of commonly used REs (from REBASE) which can be used to cut single
469
535
  stranded RNA or double stranded DNA into fragments. To list all enzymes:
470
536
 
471
537
  rebase = Bio::RestrictionEnzyme.rebase
472
- rebase.each do |enzyme_name, info|
473
- p enzyme_name
538
+ rebase.each do |enzyme_name, info|
539
+ p enzyme_name
474
540
  end
475
541
 
476
542
  and cut a sequence with an enzyme follow up with:
477
543
 
478
- res = seq.cut_with_enzyme('EcoRII', {:max_permutations => 0}, {:view_ranges => true})
544
+ res = seq.cut_with_enzyme('EcoRII', {:max_permutations => 0},
545
+ {:view_ranges => true})
479
546
  if res.kind_of? Symbol #error
480
547
  err = Err.find_by_code(res.to_s)
481
548
  unless err
482
549
  err = Err.new(:code => res.to_s)
483
550
  end
484
551
  end
485
- res.each do |frag|
486
- em = EnzymeMatch.new
552
+ res.each do |frag|
553
+ em = EnzymeMatch.new
487
554
 
488
555
  em.p_left = frag.p_left
489
556
  em.p_right = frag.p_right
@@ -493,7 +560,7 @@ and cut a sequence with an enzyme follow up with:
493
560
  em.err = nil
494
561
  em.enzyme = ar_enz
495
562
  em.sequence = ar_seq
496
- p em
563
+ p em
497
564
  end
498
565
 
499
566
 
@@ -510,21 +577,21 @@ local machine.
510
577
  Install the fasta program on your machine (the command name looks like
511
578
  fasta34. FASTA can be downloaded from ftp://ftp.virginia.edu/pub/fasta/).
512
579
  First, you must prepare your FASTA-formatted database sequence file
513
- target.pep and FASTA-formatted query.pep. (TRANSLATOR'S NOTE: I think
514
- we should provide sample data to readers.)
580
+ target.pep and FASTA-formatted query.pep.
515
581
 
516
582
  #!/usr/bin/env ruby
517
583
 
518
584
  require 'bio'
519
585
 
520
- # Creates FASTA factory object ("ssearch" instead of "fasta34" can also work)
586
+ # Creates FASTA factory object ("ssearch" instead of
587
+ # "fasta34" can also work)
521
588
  factory = Bio::Fasta.local('fasta34', ARGV.pop)
522
589
  (EDITOR's NOTE: not consistent pop command)
523
590
 
524
- # Reads FASTA-formatted files (TRANSLATOR'S NOTE: something wrong in Japanese text)
525
591
  ff = Bio::FlatFile.new(Bio::FastaFormat, ARGF)
526
592
 
527
- # Iterates over each entry. the variable "entry" is a Bio::FastaFormat object.
593
+ # Iterates over each entry. the variable "entry" is a
594
+ # Bio::FastaFormat object:
528
595
  ff.each do |entry|
529
596
  # shows definition line (begins with '>') to the standard error output
530
597
  $stderr.puts "Searching ... " + entry.definition
@@ -536,7 +603,8 @@ we should provide sample data to readers.)
536
603
  report.each do |hit|
537
604
  # If E-value is smaller than 0.0001
538
605
  if hit.evalue < 0.0001
539
- # shows identifier of query and hit, E-value, start and end positions of homologous region (TRANSLATOR'S NOTE: should I change Japanese document?)
606
+ # shows identifier of query and hit, E-value, start and
607
+ # end positions of homologous region
540
608
  print "#{hit.query_id} : evalue #{hit.evalue}\t#{hit.target_id} at "
541
609
  p hit.lap_at
542
610
  end
@@ -550,7 +618,6 @@ We named above script as f_search.rb. You can execute as follows:
550
618
  In above script, the variable "factory" is a factory object for executing
551
619
  FASTA many times easily. Instead of using Fasta#query method,
552
620
  Bio::Sequence#fasta method can be used.
553
- (TRANSLATOR'S NOTE: Bio::Sequence#fasta are not so frequently used.)
554
621
 
555
622
  seq = ">test seq\nYQVLEEIGRGSFGSVRKVIHIPTKKLLVRKDIKYGHMNSKE"
556
623
  seq.fasta(factory)
@@ -566,7 +633,6 @@ Bio::Fasta#query returns Bio::Fasta::Report object.
566
633
  We can get almost all information described in FASTA report text
567
634
  with the Report object. For example, getting information for hits:
568
635
 
569
-
570
636
  report.each do |hit|
571
637
  puts hit.evalue # E-value
572
638
  puts hit.sw # Smith-Waterman score (*)
@@ -575,15 +641,19 @@ with the Report object. For example, getting information for hits:
575
641
  puts hit.query_id # identifier of query sequence
576
642
  puts hit.query_def # definition(comment line) of query sequence
577
643
  puts hit.query_len # length of query sequence
578
- puts hit.query_seq # query sequence (TRANSLATOR'S NOTE: sequence of homologous region of query sequence)
644
+ puts hit.query_seq # sequence of homologous region
579
645
  puts hit.target_id # identifier of hit sequence
580
646
  puts hit.target_def # definition(comment line) of hit sequence
581
647
  puts hit.target_len # length of hit sequence
582
- puts hit.target_seq # hit sequence (TRANSLATOR'S NOTE: sequence of homologous region of hit sequence)
583
- puts hit.query_start # start position of homologous region in query sequence
584
- puts hit.query_end # end position of homologous region in query sequence
585
- puts hit.target_start # start posiotion of homologous region in hit(target) sequence
586
- puts hit.target_end # end position of homologous region in hit(target) sequence
648
+ puts hit.target_seq # sequence of homologous region of hit sequence
649
+ puts hit.query_start # start position of homologous
650
+ # region in query sequence
651
+ puts hit.query_end # end position of homologous region
652
+ # in query sequence
653
+ puts hit.target_start # start position of homologous region
654
+ # in hit(target) sequence
655
+ puts hit.target_end # end position of homologous region
656
+ # in hit(target) sequence
587
657
  puts hit.lap_at # array of above four numbers
588
658
  end
589
659
 
@@ -676,25 +746,25 @@ There are some additional BLAST methods, for example, bit_score and
676
746
  midline.
677
747
 
678
748
  report.each do |hit|
679
- puts hit.bit_score # bit score (*)
680
- puts hit.query_seq # query sequence (TRANSLATOR'S NOTE: sequence of homologous region of query sequence)
681
- puts hit.midline # middle line string of alignment of homologous region (*)
682
- puts hit.target_seq # hit sequence (TRANSLATOR'S NOTE: sequence of homologous region of query sequence)
683
-
684
- puts hit.evalue # E-value
685
- puts hit.identity # % identity
686
- puts hit.overlap # length of overlapping region
687
- puts hit.query_id # identifier of query sequence
688
- puts hit.query_def # definition(comment line) of query sequence
689
- puts hit.query_len # length of query sequence
690
- puts hit.target_id # identifier of hit sequence
691
- puts hit.target_def # definition(comment line) of hit sequence
692
- puts hit.target_len # length of hit sequence
693
- puts hit.query_start # start position of homologous region in query sequence
694
- puts hit.query_end # end position of homologous region in query sequence
695
- puts hit.target_start # start position of homologous region in hit(target) sequence
696
- puts hit.target_end # end position of homologous region in hit(target) sequence
697
- puts hit.lap_at # array of above four numbers
749
+ puts hit.bit_score
750
+ puts hit.query_seq
751
+ puts hit.midline
752
+ puts hit.target_seq
753
+
754
+ puts hit.evalue
755
+ puts hit.identity
756
+ puts hit.overlap
757
+ puts hit.query_id
758
+ puts hit.query_def
759
+ puts hit.query_len
760
+ puts hit.target_id
761
+ puts hit.target_def
762
+ puts hit.target_len
763
+ puts hit.query_start
764
+ puts hit.query_end
765
+ puts hit.target_start
766
+ puts hit.target_end
767
+ puts hit.lap_at
698
768
  end
699
769
 
700
770
  For simplicity and API compatibility, some information such as score
@@ -1131,39 +1201,66 @@ to be written...
1131
1201
 
1132
1202
  == The BioRuby example programs
1133
1203
 
1134
- Some sample programs are stored in samples/ directry.
1135
- Some programs are obsolete. Since samples are not enough,
1136
- practical and interesting samples are welcome.
1204
+ Some sample programs are stored in the ./sample/ directory. Run for example:
1137
1205
 
1138
- to be written...
1206
+ ./sample/na2aa.rb test/data/fasta/example1.txt
1207
+
1208
+ == Unit testing and doctests
1139
1209
 
1140
- (EDITOR's NOTE: I would like some examples automatically
1141
- included - with output)
1210
+ BioRuby comes with an extensive testing framework with over 1300 tests and 2700
1211
+ assertions. To run the unit tests:
1212
+
1213
+ cd test
1214
+ ruby runner.rb
1215
+
1216
+ We have also started with doctest for Ruby. We are porting the examples
1217
+ in this tutorial to doctest - more info upcoming.
1142
1218
 
1143
1219
  == Further reading
1144
1220
 
1145
- See the BioRuby in anger Wiki and the class documentation for more
1146
- information on BioRuby.
1221
+ See the BioRuby in anger Wiki. A lot of BioRuby's documentation exists in the
1222
+ source code and unit tests. To really dive in you will need the latest source
1223
+ code tree. The embedded rdoc documentation can be viewed online at
1224
+ ((<URL:http://bioruby.org/rdoc/>)).
1225
+
1226
+ == BioRuby Shell
1227
+
1228
+ You can find the BioRuby shell implementation in ./lib/bio/shell. It is very interesting
1229
+ as it uses IRB (the Ruby interpreter) which is a powerful environment described in
1230
+ ((<Programming Ruby's irb chapter|URL:http://ruby-doc.org/docs/ProgrammingRuby/html/irb.html>)). IRB commands can directly be typed in the shell, e.g.
1231
+
1232
+ bioruby!> IRB.conf[:PROMPT_MODE]
1233
+ ==!> :PROMPT_C
1147
1234
 
1148
- The best book to get for understanding and getting productive with the
1149
- Ruby language is 'Programming Ruby' by Dave Thomas and Andy
1150
- Hunt. Strongly recommended!
1235
+ Optionally you may also want to install Ruby readline support -
1236
+ with Debian libreadline-ruby. To edit a previous line you may have to press
1237
+ line down (arrow down) first.
1238
+
1239
+ = Helpful tools
1240
+
1241
+ Apart from rdoc you may also want to use rtags - which allows jumping around
1242
+ source code by clicking on class and method names.
1243
+
1244
+ cd bioruby/lib
1245
+ rtags -R --vi
1246
+
1247
+ For a tutorial see ((<URL:http://rtags.rubyforge.org/>))
1151
1248
 
1152
1249
  = APPENDIX
1153
1250
 
1154
1251
  == KEGG API
1155
1252
 
1156
- Please refer to KEGG_API.rd.ja (TRANSLATOR'S NOTE: English version: ((<URL:http://www.genome.jp/kegg/soap/doc/keggapi_manual.html>)) ) and
1253
+ Please refer to KEGG_API.rd.ja (English version: ((<URL:http://www.genome.jp/kegg/soap/doc/keggapi_manual.html>)) ) and
1157
1254
 
1158
1255
  * ((<URL:http://www.genome.jp/kegg/soap/>))
1159
1256
 
1160
1257
  == Comparing BioProjects
1161
1258
 
1162
- For a quick functional comparison of BioRuby, BioPerl, BioPython and Bioconductor (R) see ((<http://sciruby.codeforpeople.com/sr.cgi/BioProjects>))
1259
+ For a quick functional comparison of BioRuby, BioPerl, BioPython and Bioconductor (R) see ((<URL:http://sciruby.codeforpeople.com/sr.cgi/BioProjects>))
1163
1260
 
1164
1261
  == Using BioRuby with R
1165
1262
 
1166
- Using Ruby with R Pjotr wrote a section on SciRuby. See ((<ULR:http://sciruby.codeforpeople.com/sr.cgi/RubyWithRlang>))
1263
+ Using Ruby with R Pjotr wrote a section on SciRuby. See ((<URL:http://sciruby.codeforpeople.com/sr.cgi/RubyWithRlang>))
1167
1264
 
1168
1265
  == Using BioPerl or BioPython from Ruby
1169
1266
 
@@ -1180,5 +1277,20 @@ painful, as the gem standard for packages evolved late and some still
1180
1277
  force you to copy things by hand. Therefore read the README's
1181
1278
  carefully that come with each package.
1182
1279
 
1183
- =end
1280
+ == Trouble shooting
1184
1281
 
1282
+ * Error: in `require': no such file to load -- bio (LoadError)
1283
+
1284
+ Ruby fails to find the BioRuby libraries - add it to the RUBYLIB path, or pass
1285
+ it to the interpreter. For example:
1286
+
1287
+ ruby -I~/cvs/bioruby/lib yourprogram.rb
1288
+
1289
+ == Modifying this page
1290
+
1291
+ IMPORTANT NOTICE: This page is maintained in the BioRuby CVS
1292
+ repository. Please edit the file there otherwise changes may get
1293
+ lost. See ((<BioRuby Developer Information>)) for CVS and mailing list
1294
+ access.
1295
+
1296
+ =end