bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
data/doc/TODO.rd.ja ADDED
@@ -0,0 +1,138 @@
1
+ =begin
2
+
3
+ $Id: TODO.rd.ja,v 1.16 2004/08/24 00:18:03 k Exp $
4
+
5
+ Copyright (C) 2001, 2002 KATAYAMA Toshiaki <k@bioruby.org>
6
+
7
+ = TODO
8
+
9
+ * �����꡼��
10
+
11
+ * �ɥ�����ȹ���
12
+ * ���塼�ȥꥢ���ɵ�
13
+ * �Ѹ첽
14
+ * RDoc
15
+
16
+ * ����ץ����� (*2tab �Ȥ�)
17
+ * sample/README.rd[.ja]
18
+
19
+ * cvs.open-bio.org �Υɥ�����ȹ���
20
+
21
+ * Bio::SQL update, name space
22
+
23
+ * Bio::Fetch
24
+ * E-Utils
25
+ * http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
26
+
27
+
28
+ * Bio::Location location coordinate system (style)
29
+
30
+ * GenBank, EMBL, SPTR, BioSQL �����硢�ե����ޥå�����Ѵ�
31
+ * genbank/ @moltype
32
+ * bio/db/seqentry.rb �� bio/seqentry.rb ������˥���ƥ� SeqEntry ����
33
+ * InternalSeq �Ȥ��Ǥ⡩
34
+ * lib/bio/db/genbank/, embl/ ��ե��������
35
+ * Bio::SQL::Sequence ��ޤ᤿������¤
36
+ * genpept, refseq(NP_)
37
+ * Bio::Reference ���饹 �� EMBL �ˤ�?
38
+ * test GenBank >350k
39
+
40
+ * Blast, Fasta, Hmmer
41
+ * blast, hmmer �Υ�ݡ��� -> Bio::FlatFile ����⥢������
42
+ * bio/appl/fasta/report.rb, bio/appl/hmmer/report.rb ����ƥʳ��Ф�
43
+ * lib/bio/appl/blast/xmlparser.rb " problem
44
+ * lib/bio/appl/megablast.rb �ɲá�
45
+ * Blast HSP tiling [bioperl-l 2002 8/13,4,5 ?]
46
+ * Bio::Blast::Report::Hit#total_alignment_length
47
+ * Bio::Blast::Report::Hit#total_identical_residues
48
+ * Bio::Blast::Report::Hit#total_conserved_residues
49
+
50
+ * MAFFT, T-Coffee, CLUSTALW �� ���饤�����
51
+ * ���饤���ȥ��֥�������
52
+ * class Bio::Alignment ?
53
+ * class Bio::Alignment::Pairwise or class Bio::PairwiseAlignment ?
54
+ * class Bio::Alignment::Multiple or class Bio::MultipleAlignment ?
55
+ * ���饤���ȥӥ塼����ۤ���
56
+ * Blast refactoring
57
+ * AlignFactory -> SearchIO��
58
+ * Bio::Align �� Bio::Blast::Report �ʤɤΥ��饤���Ȥ�
59
+ �ݻ�����ΤˤĤ���
60
+ * ���饤����ȥ��饹�� CIGAR �ե����ޥå�
61
+
62
+ * KGML�ѡ���
63
+ * Pathway���饹��dijkstra -> Graph
64
+ * Tree���饹
65
+
66
+ * lib/bio/db/kegg/keggtab.rb �� lib/bio/data/keggorg.rb �����硩
67
+
68
+ * Bio::LSID [bioperl-l 2002 7/15 seq namespace method]
69
+
70
+ * �����ǥ��󥰥�������
71
+ * camelCase �ػ�
72
+ * = �κ����϶�����
73
+ * tab �ʤ� 2 ���ڡ�������ǥ��
74
+ * �ǽ�Ū�ˤ� RDoc �ղá� (Rimport ��Ȥ��Τ���������)
75
+ * ���饹̾���⥸�塼��̾�� camelCase���������ʸ���� _ �Ƕ��ڤ�
76
+ * Array, Hash�ʾ�˥ͥ��Ȥ����饯�饹��������
77
+
78
+ === DB
79
+
80
+ * ����ȥ����� (GO etc.) ������
81
+
82
+ * ������շ� InterPro (incl. Pfam etc.) ������
83
+
84
+ * SSDB
85
+
86
+ * COG
87
+
88
+ * PDB or PDBj �ѡ���
89
+ * challenge CASP? :)
90
+
91
+ === ����
92
+
93
+ * Ruby 1.8
94
+ * stringIO
95
+ * autoload
96
+ * Array.new(num) { Hoge.new }
97
+
98
+ * ext/
99
+ * �������٤��᥽�å� C �� ?
100
+
101
+ * ����֤�ѥ��벽���뤿��Υ��饹
102
+
103
+ * �ƥ��ȴĶ�������
104
+ * Test::Unit �Ȥ���
105
+ * �����ե����롢�ͥåȥ����������ɤ����뤫
106
+ * ���󥹥ȡ������ make check �ߤ����ʤ��Ȥ򤹤뤫
107
+
108
+ * ����å�
109
+ * ���ΤȤ������ޤ�Ȥ����̤Ϥʤ������Τ�ʤ����ɡ� GUI �Ȥ��ǡ�
110
+
111
+ * �ɥ����������
112
+ * Thai
113
+ * Korea
114
+ * Russia
115
+ * French
116
+
117
+ == OTHERS
118
+
119
+ * midi
120
+
121
+ * q--p
122
+ * RSS
123
+ * tDiary
124
+
125
+ * gb.bioruby.org �� GFF, DAS �б�
126
+ * �ݥ������������ץ�ʥ��ޥ�ɥ饤��/CGI��
127
+ * Ming��
128
+
129
+ * R, SVM/RVM, ����� ��
130
+
131
+ * Ruby/Cocoa, Ruby/Gtk �ˤ�� GUI �ʥե����ȥ꡼
132
+ * gsequence
133
+ * biograph
134
+ * GO �� xml �� GtkTree �� ����­��̡�
135
+
136
+ * ��
137
+
138
+ =end
data/doc/Tutorial.rd ADDED
@@ -0,0 +1,1138 @@
1
+ =begin
2
+
3
+ $Id: Tutorial.rd,v 1.9 2005/11/01 04:31:48 nakao Exp $
4
+
5
+ Copyright (C) 2001-2003 KATAYAMA Toshiaki <k@bioruby.org>
6
+
7
+ Translated into English: Naohisa Goto <ng@bioruby.org>
8
+
9
+ Edited by: PjotrPrins
10
+
11
+ NOTE: This page is a work in progress at this point
12
+
13
+ IMPORTANT NOTICE: This page is maintained in the BioRuby CVS
14
+ repository. Please edit the file there otherwise changes may get
15
+ lost. See ((<BioRuby Developer Information>)) for CVS and mailing list
16
+ access.
17
+
18
+ = BioRuby Tutorial
19
+
20
+ == Introduction
21
+
22
+ This is a tutorial for using Bioruby. For BioRuby you need to install
23
+ Ruby and the BioRuby package on your computer. For each, follow the
24
+ instructions on the respective websites. (EDITOR's NOTE: include URL's)
25
+
26
+ (EDITOR's NOTE: describe rdoc use for individual classes)
27
+
28
+ For further information on the Ruby language see the section 'Further
29
+ reading' at the end.
30
+
31
+ You can check whether Ruby is installed on your computer and what
32
+ version it has with the
33
+
34
+ % ruby -v
35
+
36
+ command. Showing something like:
37
+
38
+ ruby 1.8.2 (2005-04-11) [powerpc-linux]
39
+
40
+
41
+ == Trying Bioruby
42
+
43
+ Bioruby comes with its own shell. After unpacking the sources run the
44
+ following command
45
+
46
+ $BIORUBY/bin/bioruby
47
+
48
+ and you should see a prompt
49
+
50
+ bioruby>
51
+
52
+ Now test the following:
53
+
54
+ bioruby> seq = Bio::Sequence::NA.new("atgcatgcaaaa")
55
+ bioruby> puts seq
56
+ atgcatgcaaaa
57
+ bioruby> puts seq.complement
58
+ ttttgcatgcat
59
+
60
+ == Working with nucleic / amino acid sequences (Bio::Sequence class)
61
+
62
+ The Bio::Sequence class allows the usual sequence transformations and
63
+ translations. In the example below the DNA sequence "atgcatgcaaaa" is
64
+ converted into the complemental strand, spliced into a subsequence,
65
+ next the nucleic acid composition is calculated and the sequence is
66
+ translated into the amino acid sequence, the molecular weight
67
+ calculated, and so on. When translating into amino acid sequences the
68
+ frame can be specified and optionally the codon table selected (as
69
+ defined in codontable.rb).
70
+
71
+
72
+ #!/usr/bin/env ruby
73
+
74
+ require 'bio'
75
+
76
+ seq = Bio::Sequence::NA.new("atgcatgcaaaa")
77
+
78
+ puts seq # original sequence
79
+ puts seq.complement # complemental sequence (Bio::Sequence::NA object)
80
+ puts seq.subseq(3,8) # gets subsequence of positions 3 to 8
81
+
82
+ p seq.gc_percent # GC percent (BioRuby 0.6.X: Float, BioRuby 0.7 or later: Integer)
83
+ p seq.composition # nucleic acid compositions (Hash)
84
+
85
+ puts seq.translate # translation (Bio::Sequence::AA object)
86
+ puts seq.translate(2) # translation from frame 2 (default is frame 1)
87
+ puts seq.translate(1,11) # using codon table No.11 (see http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
88
+
89
+ p seq.translate.codes # shows three-letter codes (Array)
90
+ p seq.translate.names # shows amino acid names (Array)
91
+ p seq.translate.composition # amino acid compositions (Hash)
92
+ p seq.translate.molecular_weight # calculating molecular weight (Float)
93
+
94
+ puts seq.complement.translate # translation of complemental strand
95
+
96
+ The p, print and puts methods are standard Ruby ways of outputting to
97
+ the screen. If you want to know more about standard Ruby commands you
98
+ can use the 'ri' command on the command line (or the help command in
99
+ Windows). For example
100
+
101
+ % ri puts
102
+ % ri p
103
+ % ri File.open
104
+
105
+ Nucleic acid sequence is an object of +Bio::Sequence::NA+ class, and
106
+ amino acid sequence is an object of +Bio::Sequence::AA+ class. Shared
107
+ methods are in the parent +Bio::Sequence+ class.
108
+
109
+ As Bio::Sequence class inherits Ruby's String class, you can use
110
+ String class methods. For example, to get a subsequence, you can
111
+ not only use subseq(from, to) but also String#[].
112
+
113
+ Please note that Ruby's strings are zero-based - i.e. the first letter
114
+ has index 0, for example:
115
+
116
+ s = 'abc'
117
+ puts s[0..0]
118
+
119
+ >a
120
+
121
+ So when using String methods, you should subtract 1 from positions
122
+ conventionally used in biology. (subseq method returns nil if you
123
+ specify positions smaller than or equal to 0 for either one of the
124
+ "from" or "to".)
125
+
126
+ (EDITOR'S NOTE: should 'subseq' not throw an exception instead?)
127
+
128
+ The window_search(window_size, step_size) method shows a typical Ruby
129
+ way of writing concise and clear code using 'closures'. Each sliding
130
+ window creates a subsequence which is supplied to the enclosed block
131
+ through a variable named +s+.
132
+
133
+ * Shows average percentage of GC content for 100 bases (stepping
134
+ the default one base at a time)
135
+
136
+ seq.window_search(100) do |s|
137
+ puts s.gc_percent
138
+ end
139
+
140
+ Since the class of each subsequence is the same as original sequence
141
+ (Bio::Sequence::NA or Bio::Sequence::AA or Bio::Sequence), you can
142
+ use all methods on the subsequence. For example,
143
+
144
+ * Shows translation results for 15 bases shifting a codon at a time
145
+
146
+ seq.window_search(15, 3) do |s|
147
+ puts s.translate
148
+ end
149
+
150
+ Finally, the window_search method returns the last leftover
151
+ subsequence. This allows for example
152
+
153
+ * Divide a genome sequence into sections of 10000bp and
154
+ output FASTA formatted sequences. The 1000bp at the start and end of
155
+ each subsequence overlap. At the 3' end of the sequence the
156
+ leftover subsequence shorter than 10000bp is also added
157
+
158
+ i = 1
159
+ remainder = seq.window_search(10000, 9000) do |s|
160
+ puts s.to_fasta("segment #{i}", 60)
161
+ i += 1
162
+ end
163
+ puts remainder.to_fasta("segment #{i}", 60)
164
+
165
+ If you don't want the overlapping window, set window size and stepping
166
+ size to equal values.
167
+
168
+ Other examples
169
+
170
+ * Count the codon usage
171
+
172
+ codon_usage = Hash.new(0)
173
+ seq.window_search(3, 3) do |s|
174
+ codon_usage[s] += 1
175
+ end
176
+
177
+ * Calculate molecular weight for each 10-aa peptide (or 10-nt nucleic acid)
178
+
179
+ seq.window_search(10, 10) do |s|
180
+ puts s.molecular_weight
181
+ end
182
+
183
+ In most cases, sequences are read from files or retrieved from databases.
184
+ For example:
185
+
186
+ require 'bio'
187
+
188
+ input_seq = ARGF.read # reads all files in arguments
189
+
190
+ my_naseq = Bio::Sequence::NA.new(input_seq)
191
+ my_aaseq = my_naseq.translate
192
+
193
+ puts my_aaseq
194
+
195
+ Save the program as na2aa.rb. Prepare a nucleic acid sequence
196
+ described below and save it as my_naseq.txt:
197
+
198
+ gtggcgatctttccgaaagcgatgactggagcgaagaaccaaagcagtgacatttgtctg
199
+ atgccgcacgtaggcctgataagacgcggacagcgtcgcatcaggcatcttgtgcaaatg
200
+ tcggatgcggcgtga
201
+
202
+ na2aa.rb translates a nucleic acid sequence to a protein sequence.
203
+ For example, to translate my_naseq.txt:
204
+
205
+ % ruby na2aa.rb my_naseq.txt
206
+
207
+ Outputs
208
+
209
+ VAIFPKAMTGAKNQSSDICLMPHVGLIRRGQRRIRHLVQMSDAA*
210
+
211
+ You can also write this, a bit fanciful, as a one-liner script.
212
+
213
+ % ruby -r bio -e 'p Bio::Sequence::NA.new($<.read).translate' my_naseq.txt
214
+
215
+ In the next section we will retrieve data from databases instead of
216
+ using raw sequence files.
217
+
218
+ == Parsing GenBank data (Bio::GenBank class)
219
+
220
+ We assume that you already have some GenBank data files. (If you don't,
221
+ download some .seq files from ftp://ftp.ncbi.nih.gov/genbank/)
222
+
223
+ As an example we fetch the ID, definition and sequence of each entry
224
+ from the GenBank format and convert it to FASTA. This is also an example
225
+ script in the BioRuby distribution.
226
+
227
+ A first attempt could be to use the Bio::GenBank class for reading in
228
+ the data:
229
+
230
+ #!/usr/bin/env ruby
231
+
232
+ require 'bio'
233
+
234
+ # Read all lines from STDIN split by the GenBank delimiter
235
+ while entry = gets(Bio::GenBank::DELIMITER)
236
+ gb = Bio::GenBank.new(entry) # creates GenBank object
237
+
238
+ print ">#{gb.accession} " # Accession
239
+ puts gb.definition # Definition
240
+ puts gb.naseq # Nucleic acid sequence (Bio::Sequence::NA object)
241
+ end
242
+
243
+ But that has the disadvantage that the code is tied to GenBank input. A more
244
+ generic method is to use Bio::FlatFile which allows you to use different
245
+ input formats:
246
+
247
+ #!/usr/bin/env ruby
248
+
249
+ require 'bio'
250
+
251
+ ff = Bio::FlatFile.new(Bio::GenBank, ARGF)
252
+ ff.each_entry do |gb|
253
+ definition = "#{gb.accession} #{gb.definition}"
254
+ puts gb.naseq.to_fasta(definition, 60)
255
+ end
256
+
257
+ For example, in turn, reading FASTA format files:
258
+
259
+ #!/usr/bin/env ruby
260
+
261
+ require 'bio'
262
+
263
+ ff = Bio::FlatFile.new(Bio::FastaFormat, ARGF)
264
+ ff.each_entry do |f|
265
+ puts "definition : " + f.definition
266
+ puts "nalen : " + f.nalen.to_s
267
+ puts "naseq : " + f.naseq
268
+ end
269
+
270
+ In the above two scripts, the first arguments of Bio::FlatFile.new are
271
+ database classes of BioRuby. This is expanded on in a later section.
272
+
273
+ Yet another option is to use the Bio::DB.open method:
274
+
275
+ #!/usr/bin/env ruby
276
+
277
+ require 'bio'
278
+
279
+ ff = Bio::GenBank.open("gbvrl1.seq")
280
+ ff.each_entry do |gb|
281
+ definition = "#{gb.accession} #{gb.definition}"
282
+ puts gb.naseq.to_fasta(definition, 60)
283
+ end
284
+
285
+ (TRANSLATOR'S NOTE: Bio::DB.open has not been used very much.)
286
+ (EDITOR's NOTE: Test code)
287
+
288
+ Next, we are going to parse the GenBank 'features', which is normally
289
+ very complicated:
290
+
291
+ #!/usr/bin/env ruby
292
+
293
+ require 'bio'
294
+
295
+ ff = Bio::FlatFile.new(Bio::GenBank, ARGF)
296
+
297
+ # iterates over each GenBank entry
298
+ ff.each_entry do |gb|
299
+
300
+ # shows accession and organism
301
+ puts "# #{gb.accession} - #{gb.organism}"
302
+
303
+ # iterates over each element in 'features'
304
+ gb.features.each do |feature|
305
+ position = feature.position
306
+ hash = feature.assoc # put into Hash
307
+
308
+ # skips the entry if "/translation=" is not found
309
+ next unless hash['translation']
310
+
311
+ # collects gene name and so on and joins it into a string
312
+ gene_info = [
313
+ hash['gene'], hash['product'], hash['note'], hash['function']
314
+ ].compact.join(', ')
315
+
316
+ # shows nucleic acid sequence
317
+ puts ">NA splicing('#{position}') : #{gene_info}"
318
+ puts gb.naseq.splicing(position)
319
+
320
+ # shows amino acid sequence translated from nucleic acid sequence
321
+ puts ">AA translated by splicing('#{position}').translate"
322
+ puts gb.naseq.splicing(position).translate
323
+
324
+ # shows amino acid sequence in the database entry (/translation=)
325
+ puts ">AA original translation"
326
+ puts hash['translation']
327
+ end
328
+ end
329
+
330
+ * Note: In this example Feature#assoc method makes a Hash from a
331
+ feature object. It is useful because you can get data from the hash
332
+ by using qualifiers as keys.
333
+ (But there is a risk some information is lost when two or more
334
+ qualifiers are the same. Therefore an Array is returned by
335
+ Feature#feature)
336
+
337
+ Bio::Sequence#splicing splices subsequence from nucleic acid sequence
338
+ according to location information used in GenBank, EMBL and DDBJ.
339
+
340
+ When the specified translation table is different from the default
341
+ (universal), or when the first codon is not "atg" or the protein
342
+ contains selenocysteine, the two amino acid sequences will differ.
343
+
344
+ The Bio::Sequence#splicing method takes not only DDBJ/EMBL/GenBank
345
+ feature style location text but also Bio::Locations object. For more
346
+ information about location format and Bio::Locations class, see
347
+ bio/location.rb.
348
+
349
+ * Splice according to location string used in a GenBank entry
350
+
351
+ naseq.splicing('join(2035..2050,complement(1775..1818),13..345)')
352
+
353
+ * Generate a Bio::Locations object and pass it to the splicing method
354
+
355
+ locs = Bio::Locations.new('join((8298.8300)..10206,1..855)')
356
+ naseq.splicing(locs)
357
+
358
+ You can also use the splicing method for amino acid sequences
359
+ (Bio::Sequence::AA objects).
360
+
361
+ * Splicing peptide from a protein (e.g. signal peptide)
362
+
363
+ aaseq.splicing('21..119')
364
+
365
+ (EDITOR's NOTE: why use STRINGs here?)
366
+
367
+ === More databases
368
+
369
+ Databases in BioRuby are essentially accessed like that of GenBank
370
+ with classes like Bio::GenBank, Bio::KEGG::GENES,
371
+ (EDITOR's NOTE: include complete list)
372
+
373
+ In many cases the Bio::DatabaseClass acts as a factory pattern
374
+ and recognises the database type automatically - returning a
375
+ parsed object. For example using Bio::FlatFile
376
+
377
+ Bio::FlatFile class as described above. The first argument of the
378
+ Bio::FlatFile.new is database class name in BioRuby (such as Bio::GenBank,
379
+ Bio::KEGG::GENES and so on).
380
+
381
+ ff = Bio::FlatFile.new(Bio::DatabaseClass, ARGF)
382
+
383
+ Isn't it wonderful that Bio::FlatFile automagically recognizes each
384
+ database class?
385
+
386
+ #!/usr/bin/env ruby
387
+
388
+ require 'bio'
389
+
390
+ ff = Bio::FlatFile.auto(ARGF)
391
+ ff.each_entry do |entry|
392
+ p entry.entry_id # identifier of the entry
393
+ p entry.definition # definition of the entry
394
+ p entry.seq # sequence data of the entry
395
+ end
396
+
397
+ Other methods to extract specific data from database objects can be
398
+ different between databases, though some methods are common (see the
399
+ guidelines for common methods as described in bio/db.rb).
400
+
401
+ * entry_id --> gets ID of the entry
402
+ * definition --> gets definition of the entry
403
+ * reference --> gets references as Bio::Reference object
404
+ * organism --> gets species
405
+ * seq, naseq, aaseq --> returns sequence as corresponding sequence object
406
+
407
+ Refer to the documents of each database to find the exact naming
408
+ of the included methods.
409
+
410
+ In principle BioRuby uses the following conventions: when a method
411
+ name is plural the method returns some object as an Array. For
412
+ example, some classes have a "references" method which returns
413
+ multiple Bio::Reference objects as an Array. And some classes have a
414
+ "reference" method which returns a single Bio::Reference object.
415
+
416
+ === Alignments (Bio::Alignment)
417
+
418
+ Bio::Alignment class in bio/alignment.rb is a container class like Ruby's Hash,
419
+ Array and BioPerl's Bio::SimpleAlign. A very simple example is:
420
+
421
+ require 'bio'
422
+
423
+ seqs = [ 'atgca', 'aagca', 'acgca', 'acgcg' ]
424
+ seqs = seqs.collect{ |x| Bio::Sequence::NA.new(x) }
425
+
426
+ # creates alignment object
427
+ a = Bio::Alignment.new(seqs)
428
+
429
+ # shows consensus sequence
430
+ p a.consensus # ==> "a?gc?"
431
+
432
+ # shows IUPAC consensus
433
+ p a.consensus_iupac # ==> "ahgcr"
434
+
435
+ # iterates over each seq
436
+ a.each { |x| p x }
437
+ # ==>
438
+ # "atgca"
439
+ # "aagca"
440
+ # "acgca"
441
+ # "acgcg"
442
+ # iterates over each site
443
+ a.each_site { |x| p x }
444
+ # ==>
445
+ # ["a", "a", "a", "a"]
446
+ # ["t", "a", "c", "c"]
447
+ # ["g", "g", "g", "g"]
448
+ # ["c", "c", "c", "c"]
449
+ # ["a", "a", "a", "g"]
450
+
451
+ # doing alignment by using CLUSTAL W.
452
+ # clustalw command must be installed.
453
+ factory = Bio::ClustalW.new
454
+ a2 = a.do_align(factory)
455
+
456
+
457
+ == Sequence homology search by using the FASTA program (Bio::Fasta)
458
+
459
+ Let's start with a query.pep file which contains a sequence in FASTA
460
+ format. In this example we are going to execute a homology search
461
+ from a remote internet site or on your local machine. Note that you
462
+ can use the ssearch program instead of fasta when you run it on your
463
+ local machine.
464
+
465
+ === using FASTA in local machine
466
+
467
+ Install the fasta program on your machine (the command name looks like
468
+ fasta34. FASTA can be downloaded from ftp://ftp.virginia.edu/pub/fasta/).
469
+ First, you must prepare your FASTA-formatted database sequence file
470
+ target.pep and FASTA-formatted query.pep. (TRANSLATOR'S NOTE: I think
471
+ we should provide sample data to readers.)
472
+
473
+ #!/usr/bin/env ruby
474
+
475
+ require 'bio'
476
+
477
+ # Creates FASTA factory object ("ssearch" instead of "fasta34" can also work)
478
+ factory = Bio::Fasta.local('fasta34', ARGV.pop)
479
+ (EDITOR's NOTE: not consistent pop command)
480
+
481
+ # Reads FASTA-formatted files (TRANSLATOR'S NOTE: something wrong in Japanese text)
482
+ ff = Bio::FlatFile.new(Bio::FastaFormat, ARGF)
483
+
484
+ # Iterates over each entry. the variable "entry" is a Bio::FastaFormat object.
485
+ ff.each do |entry|
486
+ # shows definition line (begins with '>') to the standard error output
487
+ $stderr.puts "Searching ... " + entry.definition
488
+
489
+ # executes homology search. Returns Bio::Fasta::Report object.
490
+ report = factory.query(entry)
491
+
492
+ # Iterates over each hit
493
+ report.each do |hit|
494
+ # If E-value is smaller than 0.0001
495
+ if hit.evalue < 0.0001
496
+ # shows identifier of query and hit, E-value, start and end positions of homologous region (TRANSLATOR'S NOTE: should I change Japanese document?)
497
+ print "#{hit.query_id} : evalue #{hit.evalue}\t#{hit.target_id} at "
498
+ p hit.lap_at
499
+ end
500
+ end
501
+ end
502
+
503
+ We named the above script f_search.rb. You can execute it as follows:
504
+
505
+ % ./f_search.rb query.pep target.pep > f_search.out
506
+
507
+ In the above script, the variable "factory" is a factory object for executing
508
+ FASTA many times easily. Instead of using Fasta#query method,
509
+ Bio::Sequence#fasta method can be used.
510
+ (TRANSLATOR'S NOTE: Bio::Sequence#fasta is not so frequently used.)
511
+
512
+ seq = ">test seq\nYQVLEEIGRGSFGSVRKVIHIPTKKLLVRKDIKYGHMNSKE"
513
+ seq.fasta(factory)
514
+
515
+ When you want to add options to FASTA command, you can set the
516
+ third argument of Bio::Fasta.local method. For example, setting ktup to 1
517
+ and getting top-10 hits:
518
+
519
+ factory = Bio::Fasta.local('fasta34', 'target.pep', '-b 10')
520
+ factory.ktup = 1
521
+
522
+ Bio::Fasta#query returns Bio::Fasta::Report object.
523
+ We can get almost all information described in FASTA report text
524
+ with the Report object. For example, getting information for hits:
525
+
526
+
527
+ report.each do |hit|
528
+ puts hit.evalue # E-value
529
+ puts hit.sw # Smith-Waterman score (*)
530
+ puts hit.identity # % identity
531
+ puts hit.overlap # length of overlapping region
532
+ puts hit.query_id # identifier of query sequence
533
+ puts hit.query_def # definition(comment line) of query sequence
534
+ puts hit.query_len # length of query sequence
535
+ puts hit.query_seq # query sequence (TRANSLATOR'S NOTE: sequence of homologous region of query sequence)
536
+ puts hit.target_id # identifier of hit sequence
537
+ puts hit.target_def # definition(comment line) of hit sequence
538
+ puts hit.target_len # length of hit sequence
539
+ puts hit.target_seq # hit sequence (TRANSLATOR'S NOTE: sequence of homologous region of hit sequence)
540
+ puts hit.query_start # start position of homologous region in query sequence
541
+ puts hit.query_end # end position of homologous region in query sequence
542
+ puts hit.target_start # start position of homologous region in hit(target) sequence
543
+ puts hit.target_end # end position of homologous region in hit(target) sequence
544
+ puts hit.lap_at # array of above four numbers
545
+ end
546
+
547
+ Most of above methods are common with the Bio::Blast::Report described
548
+ below. Please refer to document of Bio::Fasta::Report class for
549
+ FASTA-specific details.
550
+
551
+ If you need original output text of FASTA program you can use the "output"
552
+ method of the factory object after the "query" method.
553
+
554
+ report = factory.query(entry)
555
+ puts factory.output
556
+
557
+
558
+ === using FASTA from a remote internet site
559
+
560
+ * Note: Currently, only GenomeNet (fasta.genome.jp) is
561
+ supported. Check the class documentation for updates.
562
+
563
+ For accessing a remote site the Bio::Fasta.remote method is used
564
+ instead of Bio::Fasta.local. When using a remote method, the
565
+ databases available may be limited, but, otherwise, you can do the
566
+ same things as with a local method.
567
+
568
+ Available databases in GenomeNet:
569
+
570
+ * Protein database
571
+ * nr-aa, genes, vgenes.pep, swissprot, swissprot-upd, pir, prf, pdbstr
572
+
573
+ * Nucleic acid database
574
+ * nr-nt, genbank-nonst, gbnonst-upd, dbest, dbgss, htgs, dbsts,
575
+ embl-nonst, embnonst-upd, genes-nt, genome, vgenes.nuc
576
+
577
+ Select the databases you require. Next, give the search program from
578
+ the type of query sequence and database.
579
+
580
+ * When query is an amino acid sequence
581
+ * When protein database, program is "fasta".
582
+ * When nucleic database, program is "tfasta".
583
+
584
+ * When query is a nucleic acid sequence
585
+ * When nucleic database, program is "fasta".
586
+ * (When protein database, you would fail to search.)
587
+
588
+ For example:
589
+
590
+ program = 'fasta'
591
+ database = 'genes'
592
+
593
+ factory = Bio::Fasta.remote(program, database)
594
+
595
+ and try out the same commands as with the local search shown earlier.
596
+
597
+ == Homology search by using BLAST (Bio::Blast class)
598
+
599
+ The BLAST interface is very similar to that of FASTA and
600
+ both local and remote execution are supported. Basically
601
+ replace Bio::Fasta with Bio::Blast in the above examples!
602
+
603
+ For example the BLAST version of f_search.rb is:
604
+
605
+ # create BLAST factory object
606
+ factory = Bio::Blast.local('blastp', ARGV.pop)
607
+
608
+ For remote execution of BLAST in GenomeNet, Bio::Blast.remote is used.
609
+ The parameter "program" is different from FASTA - as you can expect:
610
+
611
+ * When query is an amino acid sequence
612
+ * When protein database, program is "blastp".
613
+ * When nucleic database, program is "tblastn".
614
+
615
+ * When query is a nucleic acid sequence
616
+ * When protein database, program is "blastx"
617
+ * When nucleic database, program is "blastn".
618
+ * ("tblastx" for six-frame search.)
619
+
620
+ Bio::Blast uses "-m 7" XML output of BLAST by default when either
621
+ XMLParser or REXML (both of them are XML parser libraries for Ruby -
622
+ of the two XMLParser is the fastest) is installed on your computer. In
623
+ Ruby version 1.8.0, or later, REXML is bundled with Ruby's
624
+ distribution.
625
+
626
+ When no XML parser library is present, Bio::Blast uses "-m 8" tabular
627
+ delimited format. Available information is limited with the
628
+ "-m 8" format so installing an XML parser is recommended.
629
+
630
+ Again, the methods in Bio::Fasta::Report and Bio::Blast::Report (and
631
+ Bio::Fasta::Report::Hit and Bio::Blast::Report::Hit) are similar.
632
+ There are some additional BLAST methods, for example, bit_score and
633
+ midline.
634
+
635
+ report.each do |hit|
636
+ puts hit.bit_score # bit score (*)
637
+ puts hit.query_seq # query sequence (TRANSLATOR'S NOTE: sequence of homologous region of query sequence)
638
+ puts hit.midline # middle line string of alignment of homologous region (*)
639
+ puts hit.target_seq # hit sequence (TRANSLATOR'S NOTE: sequence of homologous region of hit sequence)
640
+
641
+ puts hit.evalue # E-value
642
+ puts hit.identity # % identity
643
+ puts hit.overlap # length of overlapping region
644
+ puts hit.query_id # identifier of query sequence
645
+ puts hit.query_def # definition(comment line) of query sequence
646
+ puts hit.query_len # length of query sequence
647
+ puts hit.target_id # identifier of hit sequence
648
+ puts hit.target_def # definition(comment line) of hit sequence
649
+ puts hit.target_len # length of hit sequence
650
+ puts hit.query_start # start position of homologous region in query sequence
651
+ puts hit.query_end # end position of homologous region in query sequence
652
+ puts hit.target_start # start position of homologous region in hit(target) sequence
653
+ puts hit.target_end # end position of homologous region in hit(target) sequence
654
+ puts hit.lap_at # array of above four numbers
655
+ end
656
+
657
+ For simplicity and API compatibility, some information such as score
658
+ are extracted from the first Hsp (High-scoring Segment Pair).
659
+
660
+ Check the documentation for Bio::Blast::Report to see what can be
661
+ retrieved. For now suffice to state that Bio::Blast::Report has a
662
+ hierarchical structure mirroring the general BLAST output stream:
663
+
664
+ * In a Bio::Blast::Report object, @iterations is an array of
665
+ Bio::Blast::Report::Iteration objects.
666
+ * In a Bio::Blast::Report::Iteration object, @hits is an array of
667
+ Bio::Blast::Report::Hit objects.
668
+ * In a Bio::Blast::Report::Hit object, @hsps is an array of
669
+ Bio::Blast::Report::Hsp objects.
670
+
671
+ See bio/appl/blast.rb and bio/appl/blast/*.rb for more information.
672
+
673
+ === Parsing existing BLAST output files
674
+
675
+ When you already have BLAST output files and you want to parse them,
676
+ you can directly create Bio::Blast::Report objects without the
677
+ Bio::Blast factory object. For this purpose use Bio::Blast.reports,
678
+ which supports the "-m 0" default and "-m 7" XML type output format.
679
+
680
+ #!/usr/bin/env ruby
681
+
682
+ require 'bio'
683
+
684
+ # Iterates over each XML result.
685
+ # The variable "report" is a Bio::Blast::Report object.
686
+ Bio::Blast.reports(ARGF) do |report|
687
+ puts "Hits for " + report.query_def + " against " + report.db
688
+ report.each do |hit|
689
+ print hit.target_id, "\t", hit.evalue, "\n" if hit.evalue < 0.001
690
+ end
691
+ end
692
+
693
+ Save the script as hits_under_0.001.rb. To process BLAST output
694
+ files *.xml, run:
695
+
696
+ % ruby hits_under_0.001.rb *.xml
697
+
698
+ Sometimes BLAST XML output may be wrong and can not be parsed. We
699
+ recommend installing BLAST 2.2.5 or later, and trying combinations of
700
+ the -D and -m options when you encounter problems.
701
+
702
+
703
+ === Add remote BLAST search sites
704
+
705
+ Note: this section is an advanced topic
706
+
707
+ Here is a more advanced application for using BLAST sequence homology
708
+ search services. BioRuby currently only supports GenomeNet. If you
709
+ want to add other sites, you must write the following:
710
+
711
+ * the calling CGI (command-line options must be processed for the site).
712
+ * make sure you get BLAST output text as supported format by BioRuby
713
+ (e.g. "-m 8", "-m 7" or default("-m 0")).
714
+
715
+ In addition, you must write a private class method in Bio::Blast
716
+ named "exec_MYSITE" to get query sequence and to pass the result to
717
+ Bio::Blast::Report.new(or Bio::Blast::Default::Report.new):
718
+
719
+ factory = Bio::Blast.remote(program, db, option, 'MYSITE')
720
+
721
+ When you have written the above routines, please send them to the BioRuby project and
722
+ they may be included.
723
+
724
+ == Generate a reference list using PubMed (Bio::PubMed)
725
+
726
+ The script below is an example that searches PubMed and creates a reference list.
727
+
728
+ #!/usr/bin/env ruby
729
+
730
+ require 'bio'
731
+
732
+ ARGV.each do |id|
733
+ entry = Bio::PubMed.query(id) # searches PubMed and gets entry
734
+ medline = Bio::MEDLINE.new(entry) # creates Bio::MEDLINE object from entry text
735
+ reference = medline.reference # converts into Bio::Reference object
736
+ puts reference.bibtex # shows BibTeX formatted text
737
+ end
738
+
739
+ We named the script pmfetch.rb.
740
+
741
+ % ./pmfetch.rb 11024183 10592278 10592173
742
+
743
+ Given some PubMed IDs (PMIDs) as arguments, the script retrieves information
744
+ from NCBI, parses MEDLINE format text, converts into BibTeX format and
745
+ shows them.
746
+
747
+ A keyword search is also available.
748
+
749
+ #!/usr/bin/env ruby
750
+
751
+ require 'bio'
752
+
753
+ # Concatenates argument keyword list to a string
754
+ keywords = ARGV.join(' ')
755
+
756
+ # PubMed keyword search
757
+ entries = Bio::PubMed.search(keywords)
758
+
759
+ entries.each do |entry|
760
+ medline = Bio::MEDLINE.new(entry) # creates Bio::MEDLINE object from text
761
+ reference = medline.reference # converts into Bio::Reference object
762
+ puts reference.bibtex # shows BibTeX format text
763
+ end
764
+
765
+ We named the script pmsearch.rb.
766
+
767
+ % ./pmsearch.rb genome bioinformatics
768
+
769
+ Given keywords as arguments, the script searches PubMed by the given
770
+ keywords and shows bibliography information in BibTeX format. Other
771
+ output formats are also available like the bibitem method described
772
+ below. Some journal formats like nature and nar can be used, but lack
773
+ bold and italic font output.
774
+
775
+ (EDITORs NOTE: do we have some simple object that can be queried for
776
+ author, title etc.?)
777
+
778
+ Nowadays using NCBI E-Utils is recommended. Use Bio::PubMed.esearch
779
+ and Bio::PubMed.efetch instead of above methods.
780
+
781
+
782
+ #!/usr/bin/env ruby
783
+
784
+ require 'bio'
785
+
786
+ keywords = ARGV.join(' ')
787
+
788
+ options = {
789
+ 'maxdate' => '2003/05/31',
790
+ 'retmax' => 1000,
791
+ }
792
+
793
+ entries = Bio::PubMed.esearch(keywords, options)
794
+
795
+ Bio::PubMed.efetch(entries).each do |entry|
796
+ medline = Bio::MEDLINE.new(entry)
797
+ reference = medline.reference
798
+ puts reference.bibtex
799
+ end
800
+
801
+ The script works the same as pmsearch.rb. But, by using NCBI E-Utils, more
802
+ options are available. For example published dates to search and
803
+ maximum number of hits to show results can be specified.
804
+
805
+ See the ((<help page of
806
+ E-Utils|URL:http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html>))
807
+ for more details.
808
+
809
+
810
+
811
+ === More about BibTeX
812
+
813
+ In this section, we explain the simple usage of TeX for the BibTeX format
814
+ bibliography list collected by above scripts. For example, save
815
+ BibTeX format bibliography data to a file named genoinfo.bib.
816
+
817
+ % ./pmfetch.rb 10592173 >> genoinfo.bib
818
+ % ./pmsearch.rb genome bioinformatics >> genoinfo.bib
819
+
820
+ The BibTeX can be used with TeX or LaTeX to form bibliography
821
+ information with your journal article. For more information
822
+ on BibTeX see (EDITORS NOTE: insert URL). A quick example:
823
+
824
+ Save this to hoge.tex:
825
+
826
+ \documentclass{jarticle}
827
+ \begin{document}
828
+ \bibliographystyle{plain}
829
+ foo bar KEGG database~\cite{PMID:10592173} baz hoge fuga.
830
+ \bibliography{genoinfo}
831
+ \end{document}
832
+
833
+ Then,
834
+
835
+ % latex hoge
836
+ % bibtex hoge # processes genoinfo.bib
837
+ % latex hoge # creates bibliography list
838
+ % latex hoge # inserts correct bibliography reference
839
+
840
+ Now, you get hoge.dvi and hoge.ps - the latter you can view with any
841
+ Postscript viewer.
842
+
843
+ === Bio::Reference#bibitem
844
+
845
+ When you don't want to create a bib file, you can use
846
+ Bio::Reference#bibitem method instead of Bio::Reference#bibtex.
847
+ In above pmfetch.rb and pmsearch.rb scripts, change
848
+
849
+ puts reference.bibtex
850
+ to
851
+ puts reference.bibitem
852
+
853
+
854
+ Output documents should be bundled in \begin{thebibliography}
855
+ and \end{thebibliography}. Save the following to hoge.tex
856
+
857
+ \documentclass{jarticle}
858
+ \begin{document}
859
+ foo bar KEGG database~\cite{PMID:10592173} baz hoge fuga.
860
+
861
+ \begin{thebibliography}{00}
862
+
863
+ \bibitem{PMID:10592173}
864
+ Kanehisa, M., Goto, S.
865
+ KEGG: kyoto encyclopedia of genes and genomes.,
866
+ {\em Nucleic Acids Res}, 28(1):27--30, 2000.
867
+
868
+ \end{thebibliography}
869
+ \end{document}
870
+
871
+ and run
872
+
873
+ % latex hoge # creates bibliography list
874
+ % latex hoge # inserts correct bibliography reference
875
+
876
+
877
+ = OBDA
878
+
879
+ OBDA (Open Bio Database Access) is a standardized method of sequence
880
+ database access developed by the Open Bioinformatics Foundation. It
881
+ was created during the BioHackathon by BioPerl, BioJava, BioPython,
882
+ BioRuby and other projects' members (2002).
883
+
884
+ * BioRegistry (Directory)
885
+ * Mechanism to specify how and where to retrieve sequence data for each database.
886
+
887
+ * BioFlat
888
+ * Flatfile indexing by using binary tree or BDB(Berkeley DB).
889
+
890
+ * BioFetch
891
+ * Server-client model for getting entry from database via http.
892
+
893
+ * BioSQL
894
+ * Schemas to store sequence data to relational database such as
895
+ MySQL and PostgreSQL, and methods to retrieve entries from the database.
896
+
897
+ Here we give a quick overview. Check out
898
+ ((<URL:http://obda.open-bio.org/>)) for more extensive details.
899
+
900
+ The specification is stored on CVS repository at cvs.open-bio.org,
901
+ also available via http from:
902
+ ((<URL:http://cvs.open-bio.org/cgi-bin/viewcvs/viewcvs.cgi/obda-specs/?cvsroot=obf-common>))
903
+
904
+ == BioRegistry
905
+
906
+ BioRegistry allows for locating retrieval methods and database
907
+ locations through configuration files. The priorities are
908
+
909
+ * The file specified with method's parameter
910
+ * ~/.bioinformatics/seqdatabase.ini
911
+ * /etc/bioinformatics/seqdatabase.ini
912
+ * http://www.open-bio.org/registry/seqdatabase.ini
913
+
914
+ Note that the last location refers to www.open-bio.org and is only used
915
+ when all local configuration files are not available.
916
+
917
+ In the current BioRuby implementation all local configuration files
918
+ are read. For databases with the same name settings encountered first
919
+ are used. This means that if you don't like some settings of a
920
+ database in system global configuration file
921
+ (/etc/bioinformatics/seqdatabase.ini), you can easily override it by
922
+ writing settings to ~/.bioinformatics/seqdatabase.ini.
923
+
924
+ The syntax of the configuration file is called a stanza format. For example
925
+
926
+ [DatabaseName]
927
+ protocol=ProtocolName
928
+ location=ServerName
929
+
930
+ You can write a description like above entry for every database.
931
+
932
+ The database name is a local label for yourself, so you can name it
933
+ freely and it can differ from the name of the actual databases. In the
934
+ actual specification of BioRegistry where there are two or more
935
+ settings for a database of the same name, it is proposed that
936
+ connection to the database is tried sequentially with the order
937
+ written in configuration files. However, this has not (yet) been
938
+ implemented in BioRuby.
939
+
940
+ In addition, for some protocol, you must set additional options
941
+ other than locations (e.g. user name of MySQL). In the BioRegistry
942
+ specification, current available protocols are:
943
+
944
+ * index-flat
945
+ * index-berkeleydb
946
+ * biofetch
947
+ * biosql
948
+ * bsane-corba
949
+ * xembl
950
+
951
+ In BioRuby, you can use index-flat, index-berkeleydb, biofetch and biosql.
952
+ Note that the BioRegistry specification sometimes gets updated and BioRuby
953
+ does not always follow quickly.
954
+
955
+ Here is an example. Create a Bio::Registry object. It reads the configuration
956
+ files:
957
+
958
+ reg = Bio::Registry.new
959
+
960
+ # connects to the database "genbank"
961
+ serv = reg.get_database('genbank')
962
+
963
+ # gets entry of the ID
964
+ entry = serv.get_by_id('AA2CG')
965
+
966
+
967
+ The variable "serv" is a server object corresponding to the setting
968
+ written in configuration files. The class of the object is one of
969
+ Bio::SQL, Bio::Fetch, and so on. Note that Bio::Registry#get_database("name")
970
+ returns nil if no database is found.
971
+
972
+ After that, you can use get_by_id method and some specific methods.
973
+ Please refer to below documents.
974
+
975
+ == BioFlat
976
+
977
+ BioFlat is a mechanism to create index files of flat files and to retrieve
978
+ these entries fast. There are two index types. index-flat is a simple index
979
+ performing binary search without using an external library of Ruby. index-berkeleydb
980
+ uses Berkeley DB for indexing - but requires installing bdb on your computer,
981
+ as well as the BDB Ruby package. For creating the index itself, you can use
982
+ br_bioflat.rb command bundled with BioRuby.
983
+
984
+ % br_bioflat.rb --makeindex database_name [--format data_format] filename...
985
+
986
+ The format can be omitted because BioRuby has autodetection. If that
987
+ does not work you can try specifying data format as a name of BioRuby
988
+ database class.
989
+
990
+ Search and retrieve data from database:
991
+
992
+ % br_bioflat.rb database_name identifier
993
+
994
+ For example, to create index of GenBank files gbbct*.seq and get entry
995
+ from the database:
996
+
997
+ % br_bioflat.rb --makeindex my_bctdb --format GenBank gbbct*.seq
998
+ % br_bioflat.rb my_bctdb A16STM262
999
+
1000
+ If you have Berkeley DB on your system and installed the bdb extension
1001
+ module of Ruby (see http://raa.ruby-lang.org/project/bdb/), you can
1002
+ create and search indexes with Berkeley DB - a very fast alternative
1003
+ that uses little computer memory. When creating the index, use the
1004
+ "--makeindex-bdb" option instead of "--makeindex".
1005
+
1006
+ % br_bioflat.rb --makeindex-bdb database_name [--format data_format] filename...
1007
+
1008
+ == BioFetch
1009
+
1010
+ Note: this section is an advanced topic
1011
+
1012
+ BioFetch is a database retrieval mechanism via CGI. CGI Parameters,
1013
+ options and error codes are standardized. Thus, client access via
1014
+ http is possible giving the database name, identifiers and format to
1015
+ retrieve entries.
1016
+
1017
+ The BioRuby project has a BioFetch server in bioruby.org. It uses
1018
+ GenomeNet's DBGET system as a backend. The source code of the
1019
+ server is in sample/ directory. Currently, there are only two
1020
+ BioFetch servers in the world: bioruby.org and EBI.
1021
+
1022
+ Here are some methods to retrieve entries from our BioFetch server.
1023
+
1024
+ (1) Using a web browser
1025
+
1026
+ http://bioruby.org/cgi-bin/biofetch.rb
1027
+
1028
+ (2) Using the br_biofetch.rb command
1029
+
1030
+ % br_biofetch.rb db_name entry_id
1031
+
1032
+ (3) Directly using Bio::Fetch in a script
1033
+
1034
+ serv = Bio::Fetch.new(server_url)
1035
+ entry = serv.fetch(db_name, entry_id)
1036
+
1037
+ (4) Indirectly using Bio::Fetch via BioRegistry in script
1038
+
1039
+ reg = Bio::Registry.new
1040
+ serv = reg.get_database('genbank')
1041
+ entry = serv.get_by_id('AA2CG')
1042
+
1043
+ If you want to use (4), you, obviously, have to include some settings
1044
+ in seqdatabase.ini. E.g.
1045
+
1046
+ [genbank]
1047
+ protocol=biofetch
1048
+ location=http://bioruby.org/cgi-bin/biofetch.rb
1049
+ biodbname=genbank
1050
+
1051
+ === The combination of BioFetch, Bio::KEGG::GENES and Bio::AAindex1
1052
+
1053
+ Bioinformatics is often about glueing things together. Here we give an
1054
+ example to get the bacteriorhodopsin gene (VNG1467G) of the archaea
1055
+ Halobacterium from KEGG GENES database and to get alpha-helix index
1056
+ data (BURA740101) from the AAindex (Amino acid indices and similarity
1057
+ matrices) database, and show the helix score for each 15-aa length
1058
+ overlapping window.
1059
+
1060
+ #!/usr/bin/env ruby
1061
+
1062
+ require 'bio'
1063
+
1064
+ entry = Bio::Fetch.query('hal', 'VNG1467G')
1065
+ aaseq = Bio::KEGG::GENES.new(entry).aaseq
1066
+
1067
+ entry = Bio::Fetch.query('aax1', 'BURA740101')
1068
+ helix = Bio::AAindex1.new(entry).index
1069
+
1070
+ position = 1
1071
+ win_size = 15
1072
+
1073
+ aaseq.window_search(win_size) do |subseq|
1074
+ score = subseq.total(helix)
1075
+ puts [ position, score ].join("\t")
1076
+ position += 1
1077
+ end
1078
+
1079
+ The special method Bio::Fetch.query uses preset BioFetch server
1080
+ in bioruby.org. (The server internally gets data from GenomeNet.
1081
+ Because the KEGG/GENES database and AAindex database are not available
1082
+ from other BioFetch servers, we used bioruby.org server with
1083
+ Bio::Fetch.query method.)
1084
+
1085
+ == BioSQL
1086
+
1087
+ to be written...
1088
+
1089
+ == The BioRuby example programs
1090
+
1091
+ Some sample programs are stored in samples/ directory.
1092
+ Some programs are obsolete. Since samples are not enough,
1093
+ practical and interesting samples are welcome.
1094
+
1095
+ to be written...
1096
+
1097
+ (EDITOR's NOTE: I would like some examples automatically
1098
+ included - with output)
1099
+
1100
+ == Further reading
1101
+
1102
+ See the BioRuby in anger Wiki and the class documentation for more
1103
+ information on BioRuby.
1104
+
1105
+ The best book to get for understanding and getting productive with the
1106
+ Ruby language is 'Programming Ruby' by Dave Thomas and Andy
1107
+ Hunt. Strongly recommended!
1108
+
1109
+ = APPENDIX
1110
+
1111
+ == KEGG API
1112
+
1113
+ Please refer to KEGG_API.rd.ja (TRANSLATOR'S NOTE: English version: ((<URL:http://www.genome.jp/kegg/soap/doc/keggapi_manual.html>)) ) and
1114
+
1115
+ * ((<URL:http://www.genome.jp/kegg/soap/>))
1116
+
1117
+ == Using BioRuby with R
1118
+
1119
+ The R libraries can be accessed from Ruby using the @@FIXME
1120
+ package. This allows at least use of the standard R library
1121
+ functions. Unfortunately there is no binding for dynamic R - so at
1122
+ this point you'll have to create some command line interface.
1123
+
1124
+ == Using BioPerl from Ruby
1125
+
1126
+ == Installing required external library
1127
+
1128
+ At this point for using BioRuby no additional libraries are needed.
1129
+ This may change, so keep an eye on the BioRuby website. Also when
1130
+ a package is missing BioRuby should show an informative message.
1131
+
1132
+ At this point installing third party Ruby packages can be a bit
1133
+ painful, as the gem standard for packages evolved late and some still
1134
+ force you to copy things by hand. Therefore read the README's
1135
+ carefully that come with each package.
1136
+
1137
+ =end
1138
+