bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
data/sample/tdiary.rb ADDED
@@ -0,0 +1,158 @@
1
+ #
2
+ # tDiary : plugin/bio.rb
3
+ #
4
+ # Copyright (C) 2003 KATAYAMA Toshiaki <k@bioruby.org>
5
+ # Mitsuteru C. Nakao <n@bioruby.org>
6
+ # Itoshi NIKAIDO <itoshi@gsc.riken.go.jp>
7
+ # Takeya KASUKAWA <kasukawa@gsc.riken.go.jp>
8
+ #
9
+ # This library is free software; you can redistribute it and/or
10
+ # modify it under the terms of the GNU Lesser General Public
11
+ # License as published by the Free Software Foundation; either
12
+ # version 2 of the License, or (at your option) any later version.
13
+ #
14
+ # This library is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
+ # Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public
20
+ # License along with this library; if not, write to the Free Software
21
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
+ #
23
+ # $Id: tdiary.rb,v 1.3 2003/03/17 04:24:47 k Exp $
24
+ #
25
+
26
+ =begin
27
+
28
+ == What's this?
29
+
30
+ This is a plugin for the ((<tDiary|URL:http://www.tdiary.org/>)) to create
31
+ various links for biological resources from your diary.
32
+
33
+ tDiary is an extensible web diary application written in Ruby.
34
+
35
+ == How to install
36
+
37
+ Just copy this file under the tDiary's plugin directory as bio.rb.
38
+
39
+ == Usage
40
+
41
+ --- pubmed(pmid, comment = nil)
42
+
43
+ Create a link to NCBI Entrez reference database by using PubMed ID.
44
+ See ((<URL:http://www.ncbi.nlm.nih.gov/entrez/query.fcgi>)) for more
45
+ information.
46
+
47
+ * tDiary style
48
+ * <%= pubmed 12345 %>
49
+ * <%= pubmed 12345, 'hogehoge' %>
50
+ * RD style
51
+ * ((% pubmed 12345 %))
52
+ * ((% pubmed 12345, 'hogehoge' %))
53
+
54
+ --- biofetch(db, entry_id)
55
+
56
+ Create a link to the BioFetch database entry retrieval system.
57
+ See ((<URL:http://biofetch.bioruby.org/>)) for more information.
58
+
59
+ * tDiary style
60
+ * <%= biofetch 'genbank', 'AA2CG' %>
61
+ * RD style
62
+ * ((% biofetch 'genbank', 'AA2CG' %))
63
+
64
+ --- amigo(go_id = '0003673', comment = nil)
65
+
66
+ Create a link to the AmiGO GO term browser by using GO ID.
67
+ See ((<URL:http://www.godatabase.org/cgi-bin/go.cgi>)) for more
68
+ information.
69
+
70
+ * tDiary style
71
+ * <%= amigo '0003673' %>
72
+ * <%= amigo '0003673', 'The root of GO' %>
73
+ * RD style
74
+ * ((% amigo 0003673 %))
75
+ * ((% amigo 0003673, 'The root of GO' %))
76
+
77
+ --- fantom(id, comment = nil)
78
+
79
+ Create a link to FANTOM database by using Clone ID.
80
+ You can use RIKEN clone ID, Rearray ID, Seq ID and Accession Number.
81
+ See ((<URL:http://fantom2.gsc.riken.go.jp/db/>)) for more information.
82
+
83
+ * tDiary style
84
+ * <%= fantom 12345 %>
85
+ * <%= fantom 12345, 'hogehoge' %>
86
+ * RD style
87
+ * ((% fantom 12345 %))
88
+ * ((% fantom 12345, 'hogehoge' %))
89
+
90
+ --- rtps(id, comment = nil)
91
+
92
+ Create a link to FANTOM RTPS database by using Clone ID.
93
+ You can use only RTPS ID.
94
+ See ((<URL:http://fantom2.gsc.riken.go.jp/RTPS/>)) for more information.
95
+
96
+ * tDiary style
97
+ * <%= rtps 12345 %>
98
+ * <%= rtps 12345, 'hogehoge' %>
99
+ * RD style
100
+ * ((% rtps 12345 %))
101
+ * ((% rtps 12345, 'hogehoge' %))
102
+
103
+ == References
104
+
105
+ * Analysis of the mouse transcriptome based on functional annotation of
106
+ 60,770 full-length cDNAs, The FANTOM Consortium and the RIKEN Genome
107
+ Exploration Research Group Phase I & II Team, Nature 420:563-573, 2002
108
+
109
+ * Functional annotation of a full-length mouse cDNA collection,
110
+ The RIKEN Genome Exploration Research Group Phase II Team and
111
+ the FANTOM Consortium, Nature 409:685-690, 2001
112
+
113
+ =end
114
+
115
# Build an HTML anchor that links to the NCBI Entrez abstract page of a
# PubMed entry.
#
# pmid    - PubMed ID; converted with to_s and stripped of surrounding
#           whitespace before use.
# comment - optional link text; when nil/false the text is "PMID:<pmid>",
#           otherwise the stripped string form of comment.
#
# Returns the anchor element as a String.
def pubmed(pmid, comment = nil)
  uid = pmid.to_s.strip
  href = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi" \
         "?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=#{uid}"
  label = comment ? comment.to_s.strip : "PMID:#{uid}"
  %Q[<a href="#{href}">#{label}</a>]
end
125
+
126
# Build an HTML anchor that links to an entry on the BioFetch server.
#
# db       - database name (e.g. 'genbank').
# entry_id - entry identifier within that database.
#
# Both arguments are converted with to_s and stripped, the same way the
# other link helpers in this plugin normalise their IDs, so that stray
# whitespace does not end up inside the query string.
#
# Returns the anchor element as a String; the link text is "db:entry_id".
def biofetch(db, entry_id)
  db = db.to_s.strip
  entry_id = entry_id.to_s.strip
  url = "http://biofetch.bioruby.org/"
  %Q[<a href="#{url}?db=#{db};id=#{entry_id};style=raw">#{db}:#{entry_id}</a>]
end
130
+
131
# Build an HTML anchor that queries the AmiGO GO term browser for a GO ID.
#
# go_id   - GO term ID; converted with to_s and stripped. Defaults to
#           '0003673'.
# comment - optional link text, used as given; when nil/false the text is
#           "AmiGO:<go_id>".
#
# Returns the anchor element as a String.
def amigo(go_id = '0003673', comment = nil)
  term = go_id.to_s.strip
  label = comment || "AmiGO:#{term}"
  href = "http://www.godatabase.org/cgi-bin/go.cgi?query=#{term};view=query;action=query;search_constraint=terms"
  %Q[<a href="#{href}">#{label}</a>]
end
137
+
138
# Build an HTML anchor that links to a FANTOM database entry.
#
# id      - clone/entry ID; converted with to_s and stripped.
# comment - optional link text; when nil/false the text is
#           "FANTOM DB:<id>", otherwise the stripped string form of comment.
#
# Returns the anchor element as a String.
def fantom(id, comment = nil)
  clone = id.to_s.strip
  href = "http://fantom2.gsc.riken.go.jp/db/link/id.cgi?id=#{clone}"
  label = comment ? comment.to_s.strip : "FANTOM DB:#{clone}"
  %Q[<a href="#{href}">#{label}</a>]
end
148
+
149
# Build an HTML anchor that links to a FANTOM RTPS database entry.
#
# id      - RTPS ID; converted with to_s and stripped.
# comment - optional link text; when nil/false the text is
#           "FANTOM RTPS DB:<id>", otherwise the stripped string form of
#           comment.
#
# Returns the anchor element as a String.
def rtps(id, comment = nil)
  rtps_id = id.to_s.strip
  href = "http://fantom2.gsc.riken.go.jp/RTPS/link/id.cgi?id=#{rtps_id}"
  label = comment ? comment.to_s.strip : "FANTOM RTPS DB:#{rtps_id}"
  %Q[<a href="#{href}">#{label}</a>]
end
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # tfastx2tab.rb - convert TFASTX (-m 6) output into tab delimited data for MySQL
4
+ #
5
+ # Usage:
6
+ #
7
+ # % tfastx2tab.rb TFASTX-output-file[s] > tfastx_results.tab
8
+ # % mysql < tfastx_results.sql (use sample at the end of this file)
9
+ #
10
+ # Format accepted:
11
+ #
12
+ # % tfastx3[3][_t] -Q -H -m 6 query.f target.f ktup > TFASTX-output-file
13
+ #
14
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
15
+ #
16
+ # This program is free software; you can redistribute it and/or modify
17
+ # it under the terms of the GNU General Public License as published by
18
+ # the Free Software Foundation; either version 2 of the License, or
19
+ # (at your option) any later version.
20
+ #
21
+ # This program is distributed in the hope that it will be useful,
22
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
23
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24
+ # GNU General Public License for more details.
25
+ #
26
+ # $Id: tfastx2tab.rb,v 0.1 2001/06/21 08:26:14 katayama Exp $
27
+ #
28
+
29
# Read the TFASTX report(s) named on the command line (or stdin) line by
# line and print one tab-delimited row per hit:
#   file name, query length, target, target length, the eleven score
#   columns (initn .. lap) and finally the frame.
q_len = nil

while (line = gets)
  # query: remember the query length until the next query header
  q_len = $1 if line =~ /^\S+: (\d+) aa$/

  # each hit starts with a ">>target ... (N aa)" line
  next unless line =~ /^>>([^>]\S+).*\((\d+) aa\)$/

  target = $1
  t_len = $2

  # The two lines that follow the hit header carry the scores as
  # alternating "label value" words; pick the value columns by position.
  score_words = gets.split(/\s+/)
  frame, initn, init1, opt, zscore, bits, evalue =
    score_words.values_at(1, 3, 5, 7, 9, 11, 13)

  align_words = gets.split(/\s+/)
  sw, ident, ugident, overlap, lap =
    align_words.values_at(2, 3, 5, 8, 11)

  # query-hit pair
  print "#{$FILENAME}\t#{q_len}\t#{target}\t#{t_len}"

  # print values, keeping only digits and the characters . : e -
  [initn, init1, opt, zscore, bits, evalue,
   sw, ident, ugident, overlap, lap].each do |value|
    value.tr!('^0-9.:e\-', '')
    print "\t#{value}"
  end

  print "\t#{frame}\n"
end
75
+
76
+ =begin MySQL tfastx_results.sql sample
77
+
78
+ CREATE DATABASE IF NOT EXISTS db_name;
79
+ CREATE TABLE IF NOT EXISTS db_name.table_name (
80
+ query varchar(25) not NULL,
81
+ q_len integer unsigned default 0,
82
+ target varchar(25) not NULL,
83
+ t_len integer unsigned default 0,
84
+ initn integer unsigned default 0,
85
+ init1 integer unsigned default 0,
86
+ opt integer unsigned default 0,
87
+ zscore float default 0.0,
88
+ bits float default 0.0,
89
+ evalue float default 0.0,
90
+ sw integer unsigned default 0,
91
+ ident float default 0.0,
92
+ ugident float default 0.0,
93
+ overlap integer unsigned default 0,
94
+ lap_at varchar(25) default NULL,
95
+ frame varchar(5) default NULL
96
+ );
97
+ LOAD DATA LOCAL INFILE 'tfastx_results.tab' INTO TABLE db_name.table_name;
98
+
99
+ =end
100
+
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # vs-genes.rb - homology/motif search wrapper
4
+ #
5
+ # FASTA/BLAST/Pfam interface for the multiple query in the FASTA format
6
+ #
7
+ # Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
8
+ #
9
+ # This program is free software; you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # $Id: vs-genes.rb,v 0.1 2001/06/21 08:26:31 katayama Exp $
20
+ #
21
+
22
# Print the command line help, filled in with the current default values,
# to standard output and terminate the script with exit status 1.
#
# cpu       - default number of CPUs shown for -c
# ktup      - default FASTA ktup value shown for -k
# skip      - default number of skipped queries shown for -s
# resultdir - default result directory shown for -d
# verbose   - default verbose setting shown for -v
#
# Never returns.
def usage(cpu, ktup, skip, resultdir, verbose)
  help = <<-END

Usage:

% #{$0} -p PROG -q QUERY -t TARGET [-c #] [-k #] [-s #] [-d DIR] [-v on]

options
-p PROG : (fasta3|ssearch3|tfasta3|fastx3|tfastx3)[3]
or
(blastp|blastn|blastx|tblastn|tblastx)
or
(hmmpfam|hmmpfam_n)
-q QUERY : query nucleotide or peptide sequences in the FASTA format
-t TARGET : target DB (FASTA or BLAST2 formatdb or Pfam format)

optional arguments
-c num : number of CPUs (for the SMP machines, default is #{cpu})
-k num : FASTA ktup value (2 for pep, 6 for nuc, default is #{ktup})
-s num : skip query (for the resume session, default is #{skip})
-d DIR : result output directory (default is "#{resultdir}")
-v on/off : verbose output of processing if on (default is "#{verbose}")

  END

  $stdout.print(help)
  exit(1)
end
49
+
50
+
51
+ ### initialize
52
+
53
# Collect the run-time options for the script.
#
# Starts from the built-in defaults, overrides them with every
# "-X value" pair found on the command line (ARGV), prints the usage and
# exits when program, query or target is missing, creates the result
# directory when it does not exist yet, and echoes the settings unless
# verbose mode is 'off'.
#
# Returns the option Hash keyed by the single-letter flag; values taken
# from the command line are Strings.
def init
  # default values
  arg = {
    'c' => 1,          # num of CPUs
    'k' => 2,          # ktup value for FASTA
    's' => 0,          # skip query
    'd' => "./result", # result directory
    'v' => 'off',      # verbose mode
  }

  # parse options
  ARGV.join(' ').scan(/-(\w) (\S+)/) { |flag, value| arg[flag] = value }

  # program, query and target are mandatory; usage() does not return
  unless arg['p'] && arg['q'] && arg['t']
    usage(*arg.values_at('c', 'k', 's', 'd', 'v'))
  end

  # create result output directory
  result_dir = "#{arg['d']}"
  Dir.mkdir(result_dir, 0755) unless File.directory?(result_dir)

  # print status
  unless arg['v'] == 'off'
    puts "PROG : #{arg['p']}"
    puts " ktup : #{arg['k']}" if arg['p'] =~ /fast/
    puts "QUERY : #{arg['q']}"
    puts " skip : #{arg['s']}"
    puts "TARGET : #{arg['t']}"
    puts "RESULT : #{arg['d']}"
  end

  arg
end
90
+
91
+
92
+ ### generate command line
93
+
94
# Build the shell command line that runs one search for one query.
#
# arg - option Hash; the keys read here are
#       'p' (program name), 'c' (number of CPUs), 'k' (FASTA ktup),
#       't' (target DB) and 'd' (result directory).
# orf - name of the query; the query is read from "<d>/query.<orf>" and the
#       output is redirected to "<d>/<orf>".
#
# The query name originates from FASTA definition lines, so every path
# and value placed on the command line is shell-escaped; names made only
# of letters, digits and _ - . , : + / @ come out unchanged.
#
# Returns the complete command as a String.
# Raises ArgumentError when arg['p'] is not one of the supported programs.
def cmd_line(arg, orf)
  # stdlib; loaded here because the script has no top-level require block
  require 'shellwords'

  # program with default command line options   # query -> target DB
  opt = {
    # FASTA : "-b n" for best n scores, "-d n" for best n alignment
    'fasta3'    => "fasta3 -Q -H -m 6",    # pep -> pep or nuc -> nuc
    'ssearch3'  => "ssearch3 -Q -H -m 6",  # pep -> pep or nuc -> nuc
    'tfasta3'   => "tfasta3 -Q -H -m 6",   # pep -> nuc
    'fastx3'    => "fastx3 -Q -H -m 6",    # nuc -> pep
    'tfastx3'   => "tfastx3 -Q -H -m 6",   # pep -> nuc (with frameshifts)

    'fasta33'   => "fasta33 -Q -H -m 6",   # pep -> pep or nuc -> nuc
    'ssearch33' => "ssearch33 -Q -H -m 6", # pep -> pep or nuc -> nuc
    'tfasta33'  => "tfasta33 -Q -H -m 6",  # pep -> nuc
    'fastx33'   => "fastx33 -Q -H -m 6",   # nuc -> pep
    'tfastx33'  => "tfastx33 -Q -H -m 6",  # pep -> nuc (with frameshifts)

    # BLAST : outputs XML
    'blastp'    => "blastall -m 7 -p blastp -d",  # pep -> pep
    'blastn'    => "blastall -m 7 -p blastn -d",  # nuc -> nuc
    'blastx'    => "blastall -m 7 -p blastx -d",  # nuc -> pep
    'tblastn'   => "blastall -m 7 -p tblastn -d", # pep -> nuc
    'tblastx'   => "blastall -m 7 -p tblastx -d", # nuc -> nuc (by trans)

    # Pfam : "-A n" for best n alignment, "-E n" for E value cutoff etc.
    'hmmpfam'   => "hmmpfam",    # pep -> Pfam DB
    'hmmpfam_n' => "hmmpfam -n", # nuc -> Pfam DB
  }

  # An unknown program would otherwise produce a command without any
  # executable in it, so fail early with a clear message.
  prog = opt.fetch(arg['p']) do
    raise ArgumentError, "unknown program: #{arg['p'].inspect}"
  end

  # arguments used in the command line, escaped for the shell
  cpu    = arg['c'].to_i
  ktup   = Shellwords.escape(arg['k'].to_s)
  target = Shellwords.escape(arg['t'].to_s)
  query  = Shellwords.escape("#{arg['d']}/query.#{orf}")
  result = Shellwords.escape("#{arg['d']}/#{orf}")

  if cpu > 1 # use multiple CPUs
    case arg['p']
    when /(fast|ssearch)/
      # rename program with "_t" (first space follows the program name)
      prog = prog.sub(' ', '_t ') + " -T #{cpu}"
    when /pfam/
      prog += " --cpu #{cpu}"
    end
  end

  # generate complete command line to execute
  case arg['p']
  when /fast/
    "#{prog} #{query} #{target} #{ktup} > #{result}"
  when /ssearch/
    "#{prog} #{query} #{target} > #{result}"
  when /blast/
    "#{prog} #{target} -i #{query} > #{result}"
  when /pfam/
    "#{prog} #{target} #{query} > #{result}"
  end
end
155
+
156
+
157
+ ### main
158
+
159
# Split the multi-FASTA query file into single records and run the
# configured search program once per record.
#
# For each record a temporary file "<result dir>/query.<name>" is written,
# the command produced by cmd_line is executed, and the temporary file is
# removed again. <name> is the first word of the definition line (the part
# after "DB:" for KEGG style names).
begin
  arg = init
  count = 0
  skip = arg['s'].to_i

  # File.open instead of Kernel#open: the path comes from the command line
  # and must always be treated as a plain file name.
  File.open(arg['q'], "r") do |f|
    # records are separated by a '>' at the start of a line
    f.each_line("\n>") do |seq|
      count += 1

      # skip (-s option)
      next unless count > skip

      # clean up
      seq.sub!(/\A>?[ \t]*/, '')               # delete '>' and SPACEs or TABs at the head
      seq.chomp!(">") if seq.end_with?("\n>")   # delete the separator '>' at the tail only

      # get ORF name: the first word in the definition line.
      # Anchored to the start of the record so that a blank line further
      # down does not make the whole record look like it has no definition.
      orf = seq[/\A\S+/]
      next unless orf # no definition (e.g. ">\nSEQ>" or ">\n>") -> useless for the multiple query

      # KEGG uses ">DB:ENTRY" format in the definition line
      orf = orf.split(/:/)[1] if orf =~ /:/

      # add time if the same ORF name was already used
      if File.file?("#{arg['d']}/#{orf}")
        orf = "#{orf}.#{Time.now.to_f}"
      end

      # create temporary file of the query
      query_file = "#{arg['d']}/query.#{orf}"
      File.open(query_file, "w+") do |tmp|
        tmp.print(">#{seq}")
      end

      begin
        command = cmd_line(arg, orf)

        # print status
        if arg['v'] != 'off'
          puts "#{count} : #{orf} ..."
          puts " #{command}"
        end

        # execute
        system(command.to_s)
      ensure
        # remove temporary file, also when the run was interrupted
        File.delete(query_file) if File.exist?(query_file)
      end
    end
  end
end
212
+