bio-sge 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
# Fetch gems over TLS so that packages and metadata cannot be tampered
# with in transit.
source "https://rubygems.org"

# Runtime dependencies (required when using the gem).
# lib/bio-sge.rb does `require 'bio'`, so BioRuby must be installed together
# with this gem; jeweler turns default-group gems into runtime dependencies.
gem "bio", ">= 1.4.1"

# Dependencies needed only to develop the gem: everything required to run
# rake, tests, coverage and packaging.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.2"
  gem "rcov", ">= 0"
end
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ bio (1.4.1)
5
+ git (1.2.5)
6
+ jeweler (1.5.2)
7
+ bundler (~> 1.0.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rake (0.8.7)
11
+ rcov (0.9.9)
12
+ shoulda (2.11.3)
13
+
14
+ PLATFORMS
15
+ ruby
16
+
17
+ DEPENDENCIES
18
+ bio (>= 1.4.1)
19
+ bundler (~> 1.0.0)
20
+ jeweler (~> 1.5.2)
21
+ rcov
22
+ shoulda
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Toshiaki Katayama
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,19 @@
1
+ = bio-sge
2
+
3
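+ BioRuby plugin that extracts the entries of a flatfile (via Bio::FlatFile.auto) and submits them to the Sun Grid Engine (SGE) as array jobs.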
4
+
5
+ == Contributing to bio-sge
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Toshiaki Katayama. See LICENSE.txt for
18
+ further details.
19
+
@@ -0,0 +1,53 @@
1
require 'rubygems'
require 'bundler'
begin
  # Activate the gems resolved by Bundler before anything else is required.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Packaging tasks (gemspec generation, build, release) provided by jeweler.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-sge"
  gem.homepage = "http://github.com/ktym/bioruby-sge"
  gem.license = "MIT"
  gem.summary = %Q{BioRuby plugin for Sun Grid Engine}
  gem.description = %Q{Entries in a flatfile will be parsed by the BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Huge amount of queries are automatically split into subdirectories. With a specified command line to be executed, queries are submitted to the SGE as an array job.}
  gem.email = "k@bioruby.org"
  gem.authors = ["Toshiaki Katayama"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` runs the same test files under the rcov coverage tool.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

# `rake rdoc` generates the API documentation into rdoc/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-sge #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # = Bio::SGE -- Sun Grid Engine array job submitter (Bio::FlatFile query to SGE)
4
+ #
5
+ # Copyright:: Copyright (C) 2009, 2010 Toshiaki Katayama <mailto:ktym at hgc dot jp>
6
+ # License:: Distributes under the same terms as Ruby
7
+ # Site:: http://kanehisa.hgc.jp/~k/sge/
8
+ # Download:: http://kanehisa.hgc.jp/~k/sge/sge.rb
9
+ #
10
+ # == USAGE (AS A COMMAND)
11
+ #
12
+ # Usage:
13
+ # % biosge.rb [options...] -q input_file -t db_file -c 'command --opts #{query} #{target}'
14
+ #
15
+ # Options:
16
+ # -q or --query file
17
+ # Specify a flatfile including multiple entries.
18
+ # -t or --target file
19
+ # Specify a database file to be used.
20
+ # -c or --command 'string'
21
+ # Specify a command line to be executed.
22
+ # The following identifiers can be used in the command line 'string'.
23
+ # '#{query}' fragmented query file name (== input_file)
24
+ # '#{target}' target database file name
25
+ # '#{work_dir}' current working directory
26
+ # '#{task_id}' SGE_TASK_ID
27
+ # '#{slice}' -- task_id / @@slice (integer >= 1)
28
+ # '#{input_file}' -- 'input/#{slice}/#{task_id}'
29
+ # '#{output_file}' -- 'output/#{slice}/#{task_id}'
30
+ # '#{error_file}' -- 'error/#{slice}/#{task_id}'
31
+ # -o or --sge_opts 'string'
32
+ # Additional options for the qsub command.
33
+ # '-l s_vmem=16G -l mem_req=16' to reserve 16GB RAM for each job
34
+ # '-l cpu_arch=xeon' to limit to use xeon CPUs only
35
+ # Resource reservation and backfill options:
36
+ # '-R y -l s_rt=12:0:0' to limit max exec time to 12h (SIGUSR1)
37
+ # '-R y -l h_rt=12:0:0' to limit max exec time to 12h (SIGKILL)
38
+ # '-R y -pe mpi-fillup 4' to reserve 4 threads for MPI
39
+ # -m or --task_min integer
40
+ # Start number of tasks (default is 1, increase to start from halfway).
41
+ # -M or --task_max integer
42
+ # Last value (default is a total number of entries in query).
43
+ # -s or --task_step integer
44
+ # Number of processes per one job (default is 1000). Large value is
45
+ # recommended for short tasks with a large number of queries, and
46
+ # a small value (minimum is 1) can be used for time consuming tasks
47
+ # with a small number of queries.
48
+ # --clear
49
+ # Remove a SGE script and output/error/log directories
50
+ # --clean
51
+ # Remove a count file and the extracted input directory
52
+ # --distclean
53
+ # Exec both of --clear and --clean
54
+ # -h or --help
55
+ # Print this help message.
56
+ #
57
+ # Examples:
58
+ # % biosge.rb -q data/query.pep -t data/target.pep -c 'blastall -p blastp -i #{query} -d #{target}' -o '-l cpu_arch=xeon'
59
+ # % biosge.rb -q data/query.nuc -t /usr/local/db/blast/ncbi/nr -c 'blastall -p blastx -s 10 -i #{query} -d #{target}' -o '-l cpu_arch=xeon -l sjob -l s_vmem=4G,mem_req=4'
60
+ # % biosge.rb -q data/dme.nuc -t data/dme.genome -s 1 -c 'exonerate --bestn 1 --model est2genome --showtargetgff 1 --showvulgar yes #{query} #{target}'
61
+ # % biosge.rb -q data/hsa.pep -t data/Pfam-A.hmm -m 1000 -M 2000 -s 10 -c 'hmmscan --tblout output/#{slice}/#{task_id}.tbl #{target} #{query}'
62
+ # % biosge.rb -q data/refseq.gb -c 'bp_genbank2gff3.pl -out stdout #{query}'
63
+ # % biosge.rb --distclean
64
+ #
65
+ # See also:
66
+ # http://kanehisa.hgc.jp/~k/sge/
67
+ #
68
+ # == RESULTS
69
+ #
70
+ # The execution results will be stored in the following files and directories.
71
+ #
72
+ # count.txt # correspondence table of the file numbers and entry IDs
73
+ # input/ # extracted sequence files (one file, one sequence)
74
+ # output/ # outputs of the command (numbered the same as the input files)
75
+ # error/ # errors of the command (numbered the same as the input files)
76
+ # log/ # log files of the qsub run (stdout and stderr)
77
+ #
78
+ # You can confirm whether there were no system errors during the SGE execution
79
+ # by sizes and contents of files in the log/ directory.
80
+ #
81
+ # Then, check the error/ directory whether there was a problem or not in your
82
+ # jobs (some command may utilize the stderr to another purpose).
83
+ #
84
+ # Finally, main results can be obtained from files in the output/ directory.
85
+ #
86
+
87
+ require 'bio-sge'
88
+ require 'getoptlong'
89
+
90
# Prints the command line help to standard output and terminates the
# process with a successful exit status.
#
# The program name shown in the text is taken from $0.  Occurrences of
# '#{...}' are escaped so that they are printed literally; they are the
# identifiers a user may put into the --command string.
def show_usage
  prog = File.basename($0)
  usage = %Q[
Usage:
  % #{prog} \[options...\] -q input_file -t db_file -c 'command --opts \#{query} \#{target}'

Options:
  -q or --query file
    Specify a flatfile including multiple entries.
  -t or --target file
    Specify a database file to be used.
  -c or --command 'string'
    Specify a command line to be executed.
    The following identifiers can be used in the command line 'string'.
      '\#{query}' fragmented query file name (== input_file)
      '\#{target}' target database file name
      '\#{work_dir}' current working directory
      '\#{task_id}' SGE_TASK_ID
      '\#{slice}' -- task_id / @@slice (integer >= 1)
      '\#{input_file}' -- "input/\#{slice}/\#{task_id}"
      '\#{output_file}' -- "output/\#{slice}/\#{task_id}"
      '\#{error_file}' -- "error/\#{slice}/\#{task_id}"
  -o or --sge_opts 'string'
    Additional options for the qsub command.
      '-l s_vmem=16G -l mem_req=16' to reserve 16GB RAM for each job
      '-l cpu_arch=xeon' to limit to use xeon CPUs only
    Resource reservation and backfill options:
      '-R y -l s_rt=12:0:0' to limit max exec time to 12h (SIGUSR1)
      '-R y -l h_rt=12:0:0' to limit max exec time to 12h (SIGKILL)
      '-R y -pe mpi-fillup 4' to reserve 4 threads for MPI
  -m or --task_min integer
    Start number of tasks (default is 1, increase to start from halfway).
  -M or --task_max integer
    Last value (default is a total number of entries in query).
  -s or --task_step integer
    Number of processes per one job (default is 1000). Large value is
    recommended for short tasks with a large number of queries, and
    a small value (minimum is 1) can be used for time consuming tasks
    with a small number of queries.
  -h or --help
    Print this help message.
  --clear
    Remove a SGE script and output/error/log directories
  --clean
    Remove a count file and the extracted input directory
  --distclean
    Exec both of --clear and --clean

Examples:
  % #{prog} -q data/query.pep -t data/target.pep -c 'blastall -p blastp -i \#{query} -d \#{target}' -o '-l cpu_arch=xeon'
  % #{prog} -q data/query.nuc -t /usr/local/db/blast/ncbi/nr -c 'blastall -p blastx -s 10 -i \#{query} -d \#{target}' -o '-l cpu_arch=xeon -l sjob -l s_vmem=4G,mem_req=4'
  % #{prog} -q data/dme.nuc -t data/dme.genome -s 1 -c 'exonerate --bestn 1 --model est2genome --showtargetgff 1 --showvulgar yes \#{query} \#{target}'
  % #{prog} -q data/hsa.pep -t data/Pfam-A.hmm -m 1000 -M 2000 -s 10 -c 'hmmscan --tblout output/\#{slice}/\#{task_id}.tbl \#{target} \#{query}'
  % #{prog} -q data/refseq.gb -c 'bp_genbank2gff3.pl -out stdout \#{query}'
  % #{prog} --distclean

See also:
  http://kanehisa.hgc.jp/~k/sge/

]
  puts usage
  exit
end
153
+
154
# Parsed command line options, keyed by symbol.
$opts = {}

parser = GetoptLong.new(
  [ '--query',     '-q', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--target',    '-t', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--command',   '-c', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--sge_opts',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--task_min',  '-m', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--task_max',  '-M', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--task_step', '-s', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--clear',           GetoptLong::NO_ARGUMENT ],
  [ '--clean',           GetoptLong::NO_ARGUMENT ],
  [ '--distclean',       GetoptLong::NO_ARGUMENT ],
  [ '--help',      '-h', GetoptLong::NO_ARGUMENT ]
)

# GetoptLong always reports the canonical (first listed) option name, so the
# names can be compared literally.
parser.each_option do |name, value|
  case name
  when '--query'
    $opts[:query] = value
  when '--target'
    $opts[:target] = value
  when '--command'
    $opts[:command] = value
  when '--sge_opts'
    $opts[:sge_opts] = value
  when '--task_min'
    $opts[:task_min] = value.to_i
  when '--task_max'
    $opts[:task_max] = value.to_i
  when '--task_step'
    $opts[:task_step] = value.to_i
  when '--clear'
    $opts[:clear] = true
  when '--clean'
    $opts[:clean] = true
  when '--distclean'
    $opts[:clear] = true
    $opts[:clean] = true
  when '--help'
    $opts[:help] = true
  end
end

# Help wins over everything else: asking for help must never delete files.
show_usage if $opts[:help]

if $opts[:clear] || $opts[:clean]
  cleaner = Bio::SGE.new
  cleaner.clear if $opts[:clear]
  cleaner.clean if $opts[:clean]
  # A pure clean-up invocation (e.g. `--distclean`) has nothing left to do;
  # with a command the job is re-submitted right after the clean-up.
  exit unless $opts[:command]
end

# Without a command there is nothing to submit.
show_usage unless $opts[:command]

sge = Bio::SGE.new { |opt|
  opt.query     = $opts[:query]     if $opts[:query]
  opt.target    = $opts[:target]    if $opts[:target]
  opt.command   = $opts[:command]   if $opts[:command]
  opt.sge_opts  = $opts[:sge_opts]  if $opts[:sge_opts]
  opt.task_min  = $opts[:task_min]  if $opts[:task_min]
  opt.task_max  = $opts[:task_max]  if $opts[:task_max]
  opt.task_step = $opts[:task_step] if $opts[:task_step]
}
sge.prepare
sge.submit
@@ -0,0 +1,70 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{bio-sge}
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Toshiaki Katayama"]
12
+ s.date = %q{2010-12-24}
13
+ s.default_executable = %q{biosge.rb}
14
+ s.description = %q{Entries in a flatfile will be parased by the BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Huge amount of queries are automatically splitted into subdirectories. With a specified command line to be executed, queries are submited to the SGE as an array job.}
15
+ s.email = %q{k@bioruby.org}
16
+ s.executables = ["biosge.rb"]
17
+ s.extra_rdoc_files = [
18
+ "LICENSE.txt",
19
+ "README.rdoc"
20
+ ]
21
+ s.files = [
22
+ ".document",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.rdoc",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "bin/biosge.rb",
30
+ "bio-sge.gemspec",
31
+ "lib/bio-sge.rb",
32
+ "test/helper.rb",
33
+ "test/test_bio-sge.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/ktym/bioruby-sge}
36
+ s.licenses = ["MIT"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{BioRuby plugin for Sun Grid Engine}
40
+ s.test_files = [
41
+ "test/helper.rb",
42
+ "test/test_bio-sge.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
50
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
51
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
52
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
53
+ s.add_development_dependency(%q<rcov>, [">= 0"])
54
+ s.add_development_dependency(%q<bio>, [">= 1.4.1"])
55
+ else
56
+ s.add_dependency(%q<shoulda>, [">= 0"])
57
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
58
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
59
+ s.add_dependency(%q<rcov>, [">= 0"])
60
+ s.add_dependency(%q<bio>, [">= 1.4.1"])
61
+ end
62
+ else
63
+ s.add_dependency(%q<shoulda>, [">= 0"])
64
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
65
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
66
+ s.add_dependency(%q<rcov>, [">= 0"])
67
+ s.add_dependency(%q<bio>, [">= 1.4.1"])
68
+ end
69
+ end
70
+
@@ -0,0 +1,475 @@
1
+ #
2
+ # = Bio::SGE -- Sun Grid Engine array job submitter (Bio::FlatFile query to SGE)
3
+ #
4
+ # Copyright:: Copyright (C) 2009, 2010 Toshiaki Katayama <mailto:ktym at hgc dot jp>
5
+ # License:: Distributes under the same terms as Ruby
6
+ # Site:: http://kanehisa.hgc.jp/~k/sge/
7
+ # Download:: http://kanehisa.hgc.jp/~k/sge/sge.rb
8
+ #
9
+ # == USAGE (AS A LIBRARY)
10
+ #
11
+ # The Bio::SGE class extracts entries in a biological flatfile as queries
12
+ # and executes a bulk submission to the Sun Grid Engine as an array job.
13
+ #
14
+ # This class takes a flatfile (e.g. multi FASTA file) as a 'query',
15
+ # a database file as a 'target', and a command line to be executed
16
+ # as a 'command' (see also SCRIPT VARIABLES section).
17
+ #
18
+ # The flatfile must be accepted by the Bio::FlatFile.auto class method
19
+ # of the BioRuby (http://bioruby.org/) package.
20
+ #
21
+ # Instantiation of the Bio::SGE object can be done by
22
+ #
23
+ # sge = Bio::SGE.new(query, target, command, sge_opts)
24
+ #
25
+ # or by assigning these values through accessors prior to a job submission
26
+ #
27
+ # sge = Bio::SGE.new
28
+ # sge.query = 'flat_file'
29
+ # sge.target = 'target_database_file'
30
+ # sge.command = 'command --to_be_executed --with_opts'
31
+ #
32
+ # or by assigning these values with a block parameter.
33
+ #
34
+ # sge = Bio::SGE.new { |opt|
35
+ # opt.query = 'flat_file'
36
+ # opt.target = 'target_database_file'
37
+ # opt.command = 'command --to_be_executed --with_opts'
38
+ # }
39
+ #
40
+ # Then, the "prepare" method will
41
+ #
42
+ # * create output directories
43
+ # * generate a SGE script to be submitted
44
+ # * extract each entry in the query as separate files
45
+ # (files are numbered by the order of appearance)
46
+ #
47
+ # and now you can submit your SGE job by the "submit" method.
48
+ #
49
+ # sge.prepare
50
+ # sge.submit
51
+ #
52
+ # The "submit" method will automatically take care of messy tasks such that
53
+ # (1) splitting array jobs according to the number of total jobs, (2) save
54
+ # stdout and stderr from SGE system to a separate log directory etc.
55
+ #
56
+ # == RESULTS
57
+ #
58
+ # The execution results will be stored in the following files and directories.
59
+ #
60
+ # count.txt # correspondence table of the file numbers and entry IDs
61
+ # input/ # extracted sequence files (one file, one sequence)
62
+ # output/ # outputs of the command (numbered the same as the input files)
63
+ # error/ # errors of the command (numbered the same as the input files)
64
+ # log/ # log files of the qsub run (stdout and stderr)
65
+ #
66
+ # You can confirm whether there were no system errors during the SGE execution
67
+ # by sizes and contents of files in the log/ directory.
68
+ #
69
+ # Then, check the error/ directory whether there was a problem or not in your
70
+ # jobs (some command may utilize the stderr to another purpose).
71
+ #
72
+ # Finally, main results can be obtained from files in the output/ directory.
73
+ #
74
+ # == ADVANCED USAGE
75
+ #
76
+ # You can individually call following methods instead of the "prepare" method.
77
+ #
78
+ # sge.setup # to prepare output directories
79
+ # sge.script # to generate a SGE script
80
+ # sge.extract # to extract each entry
81
+ #
82
+ # Therefore, if you want to reuse the sequence files already extracted to
83
+ # the input directory, just comment out the line calling "prepare" method
84
+ # (and also avoid using the "extract" method, of course).
85
+ #
86
+ # #sge.prepare # comment out this line in your script
87
+ # sge.script
88
+ # sge.setup
89
+ # #sge.extract # don't use this as well
90
+ #
91
+ # sge.submit # then submit
92
+ #
93
+ # Conversely, you can also clean up the working directory (e.g. to remove
94
+ # test or previous execution results) by the following methods.
95
+ #
96
+ # sge.clear # to remove a SGE script and output/error/log directories
97
+ # sge.clean # to remove a count file and the extracted input directory
98
+ # sge.distclean # to remove all of the above
99
+ #
100
+ # == SGE OPTIONS
101
+ #
102
+ # You can specify the "-t start-last:step" range values for an array job
103
+ # by following accessors (these are optional; see EXAMPLES section below).
104
+ #
105
+ # sge.task_min # start value (default is 1)
106
+ # sge.task_max # last value (default is a total number of entries in query)
107
+ # sge.task_step # number of processes per one job (default is 1000)
108
+ # sge.sge_opts # additional options for the qsub command
109
+ #
110
+ # For example, if you only need to calculate on sequences starting from 8421st
111
+ # up to 9064th, and want to invoke 100 processes per qsub execution, you
112
+ # can specify them by the following way.
113
+ #
114
+ # sge.task_min = 8421
115
+ # sge.task_max = 9064
116
+ # sge.task_step = 100
117
+ #
118
+ # sge.submit
119
+ #
120
+ # == OVER ALL SKELETON
121
+ #
122
+ # #!/usr/bin/env ruby
123
+ #
124
+ # require 'bio-sge'
125
+ #
126
+ # sge = Bio::SGE.new { |opt|
127
+ # opt.query = 'flat_file'
128
+ # opt.target = 'target_database_file'
129
+ # opt.command = 'command --to_be_executed --with_opts'
130
+ # opt.sge_opts = '-l cpu_arch=xeon'
131
+ # opt.task_min = 8421
132
+ # opt.task_max = 9064
133
+ # opt.task_step = 100
134
+ # }
135
+ # sge.clear # included in sge.distclean
136
+ # sge.clean # included in sge.distclean
137
+ # sge.script # included in sge.prepare
138
+ # sge.setup # included in sge.prepare
139
+ # sge.extract # included in sge.prepare
140
+ # sge.submit
141
+ #
142
+ # == SCRIPT VARIABLES
143
+ #
144
+ # In the 'command' specification, you can use following identifiers as variables.
145
+ #
146
+ # '#{query}' fragmented query file name (== input_file)
147
+ # '#{target}' target database file name
148
+ # '#{work_dir}' current working directory
149
+ #
150
+ # '#{task_id}' SGE_TASK_ID
151
+ # '#{slice}' -- task_id / @@slice (integer >= 1)
152
+ # '#{input_file}' -- 'input/#{slice}/#{task_id}'
153
+ # '#{output_file}' -- 'output/#{slice}/#{task_id}'
154
+ # '#{error_file}' -- 'error/#{slice}/#{task_id}'
155
+ #
156
+ # Note that these identifiers must be kept in 'single quotes' to avoid variable
157
+ # expansion before the script generation (see the EXAMPLES section below).
158
+ #
159
+ # == EXAMPLES
160
+ #
161
+ # 1. Exonerate (Query = Multi fasta protein sequences; Target = Genomic DNA)
162
+ #
163
+ # #!/usr/bin/env ruby
164
+ #
165
+ # require 'bio-sge'
166
+ #
167
+ # sge = Bio::SGE.new { |opt|
168
+ # opt.query = 'd.melanogaster.pep'
169
+ # opt.target = 'genomic_scaffolds'
170
+ # opt.command = 'exonerate --bestn 1 --model protein2genome --showtargetgff 1 --showvulgar yes #{query} #{target}'
171
+ # opt.sge_opts = '-l cpu_arch=xeon'
172
+ # }
173
+ # sge.prepare
174
+ # sge.submit
175
+ #
176
+ #
177
+ # 2. BLAST (Query = Multi fasta; Target = BLAST DB)
178
+ #
179
+ # #!/usr/bin/env ruby
180
+ #
181
+ # require 'bio-sge'
182
+ #
183
+ # sge = Bio::SGE.new { |opt|
184
+ # opt.query = 'query.pep'
185
+ # opt.target = 'target.pep'
186
+ # opt.command = 'blastall -p blastp -i #{query} -d #{target}'
187
+ # opt.sge_opts = '-l cpu_arch=xeon'
188
+ # }
189
+ # sge.prepare
190
+ # sge.submit
191
+ #
192
+ #
193
+ # 3. HMMER (Query = Multi fasta protein sequences; Target = Pfam DB)
194
+ #
195
+ # #!/usr/bin/env ruby
196
+ #
197
+ # require 'bio-sge'
198
+ #
199
+ # sge = Bio::SGE.new { |opt|
200
+ # opt.query = 'data/h.sapiens.pep'
201
+ # opt.target = 'db/Pfam_ls'
202
+ # opt.command = 'hmmscan --tblout output/#{slice}/#{task_id}.tbl #{target} #{query}'
203
+ # opt.sge_opts = '-l cpu_arch=xeon'
204
+ # }
205
+ # sge.prepare
206
+ # sge.submit
207
+ #
208
+ # 4. RefSeq to GFF (Query = RefSeq entries in GenBank format)
209
+ #
210
+ # #!/usr/bin/env ruby
211
+ #
212
+ # require 'bio-sge'
213
+ #
214
+ # sge = Bio::SGE.new { |opt|
215
+ # opt.query = 'invertebrate6.genomic.gbff'
216
+ # opt.command = 'bp_genbank2gff3.pl -out stdout #{query}'
217
+ # }
218
+ # sge.prepare
219
+ # sge.submit
220
+ #
221
+ # == CHANGE LOG
222
+ #
223
+ # === 2010/12/24 v2.5
224
+ #
225
+ # * released as a BioRuby plugin
226
+ #
227
+ # === 2010/09/11 v2.4
228
+ #
229
+ # * changed to skip "extract" when "count.txt" file exists, so that
230
+ # user can easily re-submit the job (with different parameter or fix)
231
+ # just after the --clear. To extract again (starting from scratch),
232
+ # use --clean (with --clear) or --distclean first.
233
+ # * doc fix
234
+ #
235
+ # === 2010/05/21 v2.3
236
+ #
237
+ # * slice is changed to start from 1 (instead of 0) and have 1000 files
238
+ # per directory (instead of 10000).
239
+ #
240
+ # === 2010/03/25 v2.2
241
+ #
242
+ # * doc fix
243
+ #
244
+ # === 2009/12/10 v2.1
245
+ #
246
+ # * --clear, --clean, --distclean options are supported.
247
+ #
248
+ # === 2009/12/07 v2.0
249
+ #
250
+ # * Extended to be used as a command.
251
+ #
252
+ # === 2009/11/13 v1.3
253
+ #
254
+ # * SGE class is moved under the Bio name space (Bio::SGE) as it tightly
255
+ # depends on the Bio::FlatFile (in BioRuby).
256
+ # * Bio::SGE is improved to accept options as a block parameter as well.
257
+ #
258
+ # === 2009/11/02 v1.2
259
+ #
260
+ # * slice functionality is fixed to properly create slice directories
261
+ # under the output and error directories
262
+ #
263
+ # === 2009/09/29 v1.1
264
+ #
265
+ # * slice (sub directory to suppress the number of files in a single directory)
266
+ # is introduced not to overload file server (MDS)
267
+ # * fixed document
268
+ #
269
+ # === 2009/09/29 v1.0
270
+ #
271
+ # * SGE_TASK_LAST is introduced to avoid submitting remainder jobs
272
+ # * documentation is rewritten in RDoc format
273
+ # * web site is opened and released to the public
274
+ #
275
+ # === 2009/09/29 v0.3
276
+ #
277
+ # * SGE_TASK_STEPSIZE is introduced not to overload the SGE manager
278
+ # by a bunch of short time jobs
279
+ #
280
+ # === 2009/09/23 v0.2
281
+ #
282
+ # * query/target variables are introduced to allow commands having
283
+ # BLAST-like options for specifying query and target files
284
+ # * added documentation
285
+ #
286
+ # === 2009/09/17 v0.1
287
+ #
288
+ # * implemented FASTA file extraction and qsub submission functionality
289
+ #
290
+
291
+ require 'rubygems'
292
+ require 'fileutils'
293
+ require 'bio'
294
+
295
module Bio

  # Sun Grid Engine (SGE) array job submitter.
  #
  # Splits the entries of a flatfile into numbered input files, generates a
  # Ruby job script from a template and submits that script with qsub as one
  # or more array jobs.  All files and directories are created relative to
  # the current working directory.
  class SGE

    # Number of files per directory
    @@slice = 1000

    # Template string for script generation.  The %NAME% placeholders are
    # filled in by #script.  Every array task handles the task ids
    # SGE_TASK_ID .. SGE_TASK_ID + SGE_TASK_STEPSIZE - 1 (capped by
    # SGE_TASK_LAST), so that consecutive tasks never run the same entry twice.
    @@template = <<'END'
#$ -S /usr/local/bin/ruby

work_dir = "%WORK_DIR%"

offset = ENV["SGE_TASK_ID"].to_i
limit = ENV["SGE_TASK_STEPSIZE"].to_i
last = ENV["SGE_TASK_LAST"].to_i

slice = slice_old = nil

offset.upto(offset + limit - 1) do |task_id|
  break if task_id > last

  slice_old = slice
  slice = (task_id - 1) / %SLICE% + 1
  output_dir = "%OUTPUT_DIR%/#{slice}"
  error_dir = "%ERROR_DIR%/#{slice}"
  Dir.mkdir(output_dir) if slice_old != slice and ! File.directory?(output_dir)
  Dir.mkdir(error_dir) if slice_old != slice and ! File.directory?(error_dir)

  input_file = "%INPUT_DIR%/#{slice}/#{task_id}"
  output_file = "%OUTPUT_DIR%/#{slice}/#{task_id}"
  error_file = "%ERROR_DIR%/#{slice}/#{task_id}"

  query = input_file
  target = "%TARGET%"

  if File.exist?(query)
    system("%COMMAND% > #{output_file} 2> #{error_file}")
  end
end
END

    attr_accessor :query, :target, :command, :sge_opts, :count
    attr_accessor :task_min, :task_max, :task_step
    attr_accessor :work_dir, :log_dir, :input_dir, :output_dir, :error_dir

    # Creates a submitter.
    #
    # query::    flatfile holding the query entries
    # target::   database file handed to the command
    # command::  command line to execute for every entry
    # sge_opts:: additional options for qsub
    #
    # Relative +query+ and +target+ are resolved against the current working
    # directory; absolute paths are kept as they are.  When a block is given
    # the new object is yielded to it so that every accessor, including the
    # directory names, can be assigned before the object is used.
    def initialize(query = nil, target = nil, command = nil, sge_opts = nil)
      @work_dir = Dir.pwd
      @query    = absolutize(query)
      @target   = absolutize(target)
      @command  = command
      @sge_opts = sge_opts

      # Defaults are assigned before yielding; otherwise directory names set
      # in the block would be silently overwritten.
      @log_dir     = "log"
      @input_dir   = "input"
      @output_dir  = "output"
      @error_dir   = "error"
      @script_file = "script.rb"
      @count_file  = "count.txt"

      yield(self) if block_given?
    end

    # Creates the working directories, writes the job script and extracts
    # the query entries (#setup, #script and #extract in that order).
    def prepare
      setup
      script
      extract
    end

    # Submits the job script with qsub as array jobs.
    #
    # The number of entries is taken from #count or, when that is not set,
    # from the last line of the count file (an empty count file counts as
    # zero entries, i.e. nothing is submitted).  The range task_min..task_max
    # is cut into consecutive, non-overlapping chunks of at most 50000 ids
    # and one qsub is issued per chunk.
    def submit
      unless @count
        $stderr.puts "Reading #{@count_file} ..."
        @count = File.readlines(@count_file).last.to_s[/^\d+/].to_i
        $stderr.puts "done."
      end

      task_min  = @task_min  || 1
      task_max  = @task_max  || @count
      task_step = @task_step || 1000

      # system upper limit is 75000
      limit = 50000
      task_min.step(task_max, limit) do |offset|
        # The chunk ends one id before the next chunk starts.
        last = [offset + limit - 1, task_max].min
        opts = "#{@sge_opts} -o #{@log_dir} -e #{@log_dir} -cwd"
        span = "-t #{offset}-#{last}:#{task_step}"
        qsub = "qsub #{opts} #{span} #{@script_file}"
        $stderr.puts "Submitting ... #{qsub}"
        system(qsub)
      end
    end

    # Recursively deletes +file+ (a file or a directory), reporting the
    # progress on standard error.
    def rmtree(file)
      $stderr.print "Deleting #{file} ... "
      FileUtils.rmtree(file)
      $stderr.puts "done."
    end

    # Removes the job script and the output, error and log directories.
    def clear
      [@script_file, @output_dir, @error_dir, @log_dir].each { |path| rmtree(path) }
    end

    # Removes the count file and the extracted input directory.
    def clean
      [@count_file, @input_dir].each { |path| rmtree(path) }
    end

    # Removes everything #clear and #clean remove.
    def distclean
      clear
      clean
    end

    # Creates the directory +dir+ (including missing parents) unless it
    # already exists, reporting the progress on standard error.
    def mkpath(dir)
      $stderr.print "Creating #{dir} ... "
      if File.directory?(dir)
        $stderr.puts "skip (already exists)."
      else
        FileUtils.mkpath(dir)
        $stderr.puts "done."
      end
    end

    # Creates the log, input, output and error directories.
    def setup
      [@log_dir, @input_dir, @output_dir, @error_dir].each { |dir| mkpath(dir) }
    end

    # Writes the job script by filling in the placeholders of the template.
    #
    # Raises ArgumentError when no command has been specified.
    def script
      raise ArgumentError, "command is not specified" unless @command

      # Kept as an ordered list: placeholders are replaced one after another.
      substitutions = [
        ['%WORK_DIR%',   @work_dir],
        ['%INPUT_DIR%',  @input_dir],
        ['%OUTPUT_DIR%', @output_dir],
        ['%ERROR_DIR%',  @error_dir],
        ['%TARGET%',     @target],
        ['%COMMAND%',    @command],
        ['%SLICE%',      @@slice],
      ]

      sge_script = @@template.dup
      substitutions.each do |placeholder, value|
        # The block form inserts the value verbatim; a replacement string
        # would interpret backslash sequences such as \0 or \\ in a command.
        sge_script.gsub!(placeholder) { value.to_s }
      end

      File.open(@script_file, "w") do |file|
        file.puts sge_script
      end
    end

    # Extracts every entry of the query flatfile into its own file
    # "<input_dir>/<slice>/<number>" and records the number and the entry ID
    # in the count file.  Entries outside task_min..task_max are counted but
    # not written.  Nothing is done when the count file already exists, so
    # that a job can be re-submitted without extracting again.
    def extract
      return if File.exist?(@count_file)

      slice = slice_old = nil
      @count = 0
      File.open(@count_file, "a") do |count_file|
        Bio::FlatFile.auto(@query) do |ff|
          ff.each do |entry|
            @count += 1
            $stderr.print "Extracting ... #{@count} (#{entry.entry_id}) "
            if (@task_min && @count < @task_min) || (@task_max && @count > @task_max)
              $stderr.puts "skip."
              next
            end

            slice_old = slice
            slice = (@count - 1) / @@slice + 1
            slice_dir = "#{@input_dir}/#{slice}"
            # A new slice directory is only needed when the slice changes.
            mkpath(slice_dir) if slice_old != slice
            File.open("#{slice_dir}/#{@count}", "w") do |file|
              file.puts ff.entry_raw
            end
            count_file.puts [@count, entry.entry_id].join("\t")
            $stderr.puts "done."
          end
        end
      end
    end

    private

    # Returns +path+ unchanged when it is already absolute, otherwise
    # prefixes it with the working directory (nil yields "<work_dir>/").
    def absolutize(path)
      return path if path.to_s.start_with?('/')
      "#{@work_dir}/#{path}"
    end

  end # class SGE

end # module Bio
474
+
475
+
@@ -0,0 +1,18 @@
1
# Test bootstrap: activates the bundled gems, loads the test libraries and
# puts the project's lib/ and test/ directories on the load path.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the missing gems and exit with Bundler's own status code.
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# Resulting order: test/ first, then lib/, ahead of the existing entries.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(test_dir, '..', 'lib'))
require 'bio-sge'

class Test::Unit::TestCase
end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
# Placeholder test case: its single test always fails via +flunk+ until it
# is replaced with real tests.
class TestBioSge < Test::Unit::TestCase
  should("probably rename this file and start testing for real") do
    flunk("hey buddy, you should probably rename this file and start testing for real")
  end
end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bio-sge
3
+ version: !ruby/object:Gem::Version
4
+ hash: 31
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 0
10
+ version: 0.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Toshiaki Katayama
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-24 00:00:00 +09:00
19
+ default_executable: biosge.rb
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ type: :development
23
+ prerelease: false
24
+ name: shoulda
25
+ version_requirements: &id001 !ruby/object:Gem::Requirement
26
+ none: false
27
+ requirements:
28
+ - - ">="
29
+ - !ruby/object:Gem::Version
30
+ hash: 3
31
+ segments:
32
+ - 0
33
+ version: "0"
34
+ requirement: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ type: :development
37
+ prerelease: false
38
+ name: bundler
39
+ version_requirements: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 23
45
+ segments:
46
+ - 1
47
+ - 0
48
+ - 0
49
+ version: 1.0.0
50
+ requirement: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ type: :development
53
+ prerelease: false
54
+ name: jeweler
55
+ version_requirements: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 7
61
+ segments:
62
+ - 1
63
+ - 5
64
+ - 2
65
+ version: 1.5.2
66
+ requirement: *id003
67
+ - !ruby/object:Gem::Dependency
68
+ type: :development
69
+ prerelease: false
70
+ name: rcov
71
+ version_requirements: &id004 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirement: *id004
81
+ - !ruby/object:Gem::Dependency
82
+ type: :development
83
+ prerelease: false
84
+ name: bio
85
+ version_requirements: &id005 !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ hash: 5
91
+ segments:
92
+ - 1
93
+ - 4
94
+ - 1
95
+ version: 1.4.1
96
+ requirement: *id005
97
+ description: Entries in a flatfile will be parsed by BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Large numbers of queries are automatically split into subdirectories. With a specified command line to be executed, queries are submitted to the SGE as an array job.
98
+ email: k@bioruby.org
99
+ executables:
100
+ - biosge.rb
101
+ extensions: []
102
+
103
+ extra_rdoc_files:
104
+ - LICENSE.txt
105
+ - README.rdoc
106
+ files:
107
+ - .document
108
+ - Gemfile
109
+ - Gemfile.lock
110
+ - LICENSE.txt
111
+ - README.rdoc
112
+ - Rakefile
113
+ - VERSION
114
+ - bin/biosge.rb
115
+ - bio-sge.gemspec
116
+ - lib/bio-sge.rb
117
+ - test/helper.rb
118
+ - test/test_bio-sge.rb
119
+ has_rdoc: true
120
+ homepage: http://github.com/ktym/bioruby-sge
121
+ licenses:
122
+ - MIT
123
+ post_install_message:
124
+ rdoc_options: []
125
+
126
+ require_paths:
127
+ - lib
128
+ required_ruby_version: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ hash: 3
134
+ segments:
135
+ - 0
136
+ version: "0"
137
+ required_rubygems_version: !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ hash: 3
143
+ segments:
144
+ - 0
145
+ version: "0"
146
+ requirements: []
147
+
148
+ rubyforge_project:
149
+ rubygems_version: 1.3.7
150
+ signing_key:
151
+ specification_version: 3
152
+ summary: BioRuby plugin for Sun Grid Engine
153
+ test_files:
154
+ - test/helper.rb
155
+ - test/test_bio-sge.rb