RubyGems - bio-sge - Versions diffs - 0.0.0 - Mend

bio-sge 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/.document ADDED

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/Gemfile ADDED

@@ -0,0 +1,14 @@
+source "http://rubygems.org"
+# Runtime dependencies: gems required when using bio-sge.
+# lib/bio-sge.rb does `require 'bio'`, so BioRuby has to be available
+# wherever the gem is used, not only during development.
+# NOTE(review): Jeweler derives the gemspec dependencies from this file;
+# re-run `rake gemspec` after changing it.
+gem "bio", ">= 1.4.1"
+# Development dependencies: everything needed to run rake, tests,
+# features, etc.
+group :development do
+  gem "shoulda", ">= 0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler", "~> 1.5.2"
+  gem "rcov", ">= 0"
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,22 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    bio (1.4.1)
+    git (1.2.5)
+    jeweler (1.5.2)
+      bundler (~> 1.0.0)
+      git (>= 1.2.5)
+      rake
+    rake (0.8.7)
+    rcov (0.9.9)
+    shoulda (2.11.3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bio (>= 1.4.1)
+  bundler (~> 1.0.0)
+  jeweler (~> 1.5.2)
+  rcov
+  shoulda

data/LICENSE.txt ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2010 Toshiaki Katayama
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,19 @@
+= bio-sge
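+BioRuby plugin for the Sun Grid Engine (SGE). Entries in a flatfile are parsed by BioRuby's Bio::FlatFile.auto and used as queries: they are extracted into numbered files (split into subdirectories when there are many) and submitted to the SGE as an array job together with a user-specified command line.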
+== Contributing to bio-sge
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
+== Copyright
+Copyright (c) 2010 Toshiaki Katayama. See LICENSE.txt for
+further details.

data/Rakefile ADDED

@@ -0,0 +1,53 @@
+require 'rubygems'
+require 'bundler'
+# Set up the load path for the gems declared in the Gemfile; when some are
+# missing, print Bundler's message plus a hint and exit with its status code.
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+# Gem specification used by the Jeweler tasks (e.g. `rake gemspec`, which
+# regenerates bio-sge.gemspec from the values below).
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "bio-sge"
+  gem.homepage = "http://github.com/ktym/bioruby-sge"
+  gem.license = "MIT"
+  gem.summary = %Q{BioRuby plugin for Sun Grid Engine}
+  # Spelling fixed ("parsed", "split", "submitted"); this text is what users
+  # see as the published gem description.
+  gem.description = %Q{Entries in a flatfile will be parsed by the BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Huge amount of queries are automatically split into subdirectories. With a specified command line to be executed, queries are submitted to the SGE as an array job.}
+  gem.email = "k@bioruby.org"
+  gem.authors = ["Toshiaki Katayama"]
+  # Include your dependencies below. Runtime dependencies are required when using your gem,
+  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
+  #  gem.add_development_dependency 'rspec', '> 1.2.3'
+end
+Jeweler::RubygemsDotOrgTasks.new
+# `rake test`: run every test/**/test_*.rb with lib/ and test/ on the load path.
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+# `rake rcov`: the same test files, run under rcov.
+require 'rcov/rcovtask'
+Rcov::RcovTask.new do |test|
+  test.libs << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+task :default => :test
+# `rake rdoc`: API documentation built from the README and lib/.
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  # strip the trailing newline of the VERSION file so that it does not end
+  # up inside the documentation title
+  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "bio-sge #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.0.0

data/bin/biosge.rb ADDED

@@ -0,0 +1,220 @@
+#!/usr/bin/env ruby
+#
+# = Bio::SGE -- Sun Grid Engine array job submitter (Bio::FlatFile query to SGE)
+#
+# Copyright::	Copyright (C) 2009, 2010 Toshiaki Katayama <mailto:ktym at hgc dot jp>
+# License::	Distributes under the same terms as Ruby
+# Site::	http://kanehisa.hgc.jp/~k/sge/
+# Download::	http://kanehisa.hgc.jp/~k/sge/sge.rb
+#
+# == USAGE (AS A COMMAND)
+#
+# Usage:
+#     % biosge.rb [options...] -q input_file -t db_file -c 'command --opts #{query} #{target}'
+#
+# Options:
+#     -q or --query file
+#        Specify a flatfile including multiple entries.
+#     -t or --target file
+#        Specify a database file to be used.
+#     -c or --command 'string'
+#        Specify a command line to be executed.
+#        The following identifiers can be used in the command line 'string'.
+#          '#{query}'       fragmented query file name (== input_file)
+#          '#{target}'      target database file name
+#          '#{work_dir}'    current working directory
+#          '#{task_id}'     SGE_TASK_ID
+#          '#{slice}'       -- (task_id - 1) / @@slice + 1 (integer >= 1)
+#          '#{input_file}'  -- 'input/#{slice}/#{task_id}'
+#          '#{output_file}' -- 'output/#{slice}/#{task_id}'
+#          '#{error_file}'  -- 'error/#{slice}/#{task_id}'
+#     -o or --sge_opts 'string'
+#        Additional options for the qsub command.
+#          '-l s_vmem=16G -l mem_req=16' to reserve 16GB RAM for each job
+#          '-l cpu_arch=xeon'            to limit to use xeon CPUs only
+#        Resource reservation and backfill options:
+#          '-R y -l s_rt=12:0:0'         to limit max exec time to 12h (SIGUSR1)
+#          '-R y -l h_rt=12:0:0'         to limit max exec time to 12h (SIGKILL)
+#          '-R y -pe mpi-fillup 4'       to reserve 4 threads for MPI
+#     -m or --task_min integer
+#        Start number of tasks (default is 1, increase to start from halfway).
+#     -M or --task_max integer
+#        Last value (default is a total number of entries in query).
+#     -s or --task_step integer
+#        Number of processes per one job (default is 1000). Large value is
+#        recommended for short tasks with a large number of queries, and
+#        a small value (minimum is 1) can be used for time consuming tasks
+#        with a small number of queries.
+#     --clear
+#        Remove a SGE script and output/error/log directories
+#     --clean
+#        Remove a count file and the extracted input directory
+#     --distclean
+#        Exec both of --clear and --clean
+#     -h or --help
+#        Print this help message.
+#
+# Examples:
+#     % biosge.rb -q data/query.pep -t data/target.pep -c 'blastall -p blastp -i #{query} -d #{target}' -o '-l cpu_arch=xeon'
+#     % biosge.rb -q data/query.nuc -t /usr/local/db/blast/ncbi/nr -c 'blastall -p blastx -s 10 -i #{query} -d #{target}' -o '-l cpu_arch=xeon -l sjob -l s_vmem=4G,mem_req=4'
+#     % biosge.rb -q data/dme.nuc -t data/dme.genome -s 1 -c 'exonerate --bestn 1 --model est2genome --showtargetgff 1 --showvulgar yes #{query} #{target}'
+#     % biosge.rb -q data/hsa.pep -t data/Pfam-A.hmm -m 1000 -M 2000 -s 10 -c 'hmmscan --tblout output/#{slice}/#{task_id}.tbl #{target} #{query}'
+#     % biosge.rb -q data/refseq.gb -c 'bp_genbank2gff3.pl -out stdout #{query}'
+#     % biosge.rb --distclean
+#
+# See also:
+#     http://kanehisa.hgc.jp/~k/sge/
+#
+# == RESULTS
+#
+# The execution results will be stored in the following files and directories.
+#
+#   count.txt     # correspondence table of the file numbers and entry IDs
+#   input/        # extracted sequence files (one file, one sequence)
+#   output/       # outputs of the command (numbered same as the input files)
+#   error/        # errors of the command (numbered same as the input files)
+#   log/          # log files of the qsub run (stdout and stderr)
+#
+# You can confirm whether there were no system errors during the SGE execution
+# by sizes and contents of files in the log/ directory.
+#
+# Then, check the error/ directory whether there was a problem or not in your
+# jobs (some command may utilize the stderr to another purpose).
+#
+# Finally, main results can be obtained from files in the output/ directory.
+#
+require 'bio-sge'
+require 'getoptlong'
+# Prints the command line help to stdout and terminates the program
+# (Kernel#exit without an argument, i.e. a successful exit status).
+#
+# The text mirrors the USAGE section in the header comment of this file.
+# Fixes relative to the previous text: the long form of -M is spelled
+# --task_max (the name actually registered with GetoptLong), slice numbers
+# start from 1, and the soft run time limit signal is spelled SIGUSR1.
+def show_usage
+  prog  = File.basename($0)
+  usage = %Q[
+Usage:
+    % #{prog} \[options...\] -q input_file -t db_file -c 'command --opts \#{query} \#{target}'
+Options:
+    -q or --query file
+       Specify a flatfile including multiple entries.
+    -t or --target file
+       Specify a database file to be used.
+    -c or --command 'string'
+       Specify a command line to be executed.
+       The following identifiers can be used in the command line 'string'.
+         '\#{query}'       fragmented query file name (== input_file)
+         '\#{target}'      target database file name
+         '\#{work_dir}'    current working directory
+         '\#{task_id}'     SGE_TASK_ID
+         '\#{slice}'       -- (task_id - 1) / @@slice + 1 (integer >= 1)
+         '\#{input_file}'  -- "input/\#{slice}/\#{task_id}"
+         '\#{output_file}' -- "output/\#{slice}/\#{task_id}"
+         '\#{error_file}'  -- "error/\#{slice}/\#{task_id}"
+    -o or --sge_opts 'string'
+       Additional options for the qsub command.
+         '-l s_vmem=16G -l mem_req=16' to reserve 16GB RAM for each job
+         '-l cpu_arch=xeon'            to limit to use xeon CPUs only
+       Resource reservation and backfill options:
+         '-R y -l s_rt=12:0:0'         to limit max exec time to 12h (SIGUSR1)
+         '-R y -l h_rt=12:0:0'         to limit max exec time to 12h (SIGKILL)
+         '-R y -pe mpi-fillup 4'       to reserve 4 threads for MPI
+    -m or --task_min integer
+       Start number of tasks (default is 1, increase to start from halfway).
+    -M or --task_max integer
+       Last value (default is a total number of entries in query).
+    -s or --task_step integer
+       Number of processes per one job (default is 1000). Large value is
+       recommended for short tasks with a large number of queries, and
+       a small value (minimum is 1) can be used for time consuming tasks
+       with a small number of queries.
+    -h or --help
+       Print this help message.
+    --clear
+       Remove a SGE script and output/error/log directories
+    --clean
+       Remove a count file and the extracted input directory
+    --distclean
+       Exec both of --clear and --clean
+Examples:
+    % #{prog} -q data/query.pep -t data/target.pep -c 'blastall -p blastp -i \#{query} -d \#{target}' -o '-l cpu_arch=xeon'
+    % #{prog} -q data/query.nuc -t /usr/local/db/blast/ncbi/nr -c 'blastall -p blastx -s 10 -i \#{query} -d \#{target}' -o '-l cpu_arch=xeon -l sjob -l s_vmem=4G,mem_req=4'
+    % #{prog} -q data/dme.nuc -t data/dme.genome -s 1 -c 'exonerate --bestn 1 --model est2genome --showtargetgff 1 --showvulgar yes \#{query} \#{target}'
+    % #{prog} -q data/hsa.pep -t data/Pfam-A.hmm -m 1000 -M 2000 -s 10 -c 'hmmscan --tblout output/\#{slice}/\#{task_id}.tbl \#{target} \#{query}'
+    % #{prog} -q data/refseq.gb -c 'bp_genbank2gff3.pl -out stdout \#{query}'
+    % #{prog} --distclean
+See also:
+    http://kanehisa.hgc.jp/~k/sge/
+]
+  puts usage
+  exit
+end
+# Command line options collected from ARGV, keyed by symbol
+# (:query, :target, :command, :sge_opts, :task_min, :task_max, :task_step,
+# :clear, :clean, :help).
+$opts = {}
+args = GetoptLong.new(
+  [ '--query',     '-q',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--target',    '-t',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--command',   '-c',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--sge_opts',  '-o',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--task_min',  '-m',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--task_max',  '-M',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--task_step', '-s',  GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--clear',            GetoptLong::NO_ARGUMENT ],
+  [ '--clean',            GetoptLong::NO_ARGUMENT ],
+  [ '--distclean',        GetoptLong::NO_ARGUMENT ],
+  [ '--help',      '-h',  GetoptLong::NO_ARGUMENT ]
+)
+# Returns +value+ (the argument of option +name+) as a positive Integer.
+#
+# Aborts with a message on stderr when the value is not a positive decimal
+# number.  String#to_i would silently turn malformed input into 0, but
+# entries are numbered from 1 and the documented minimum step is 1, so 0 is
+# never a meaningful value for these options.
+def positive_integer(name, value)
+  unless value =~ /\A\d+\z/ && value.to_i > 0
+    abort "#{File.basename($0)}: #{name} requires a positive integer (got #{value.inspect})"
+  end
+  value.to_i
+end
+# GetoptLong yields the canonical long option name, so plain string
+# comparison is sufficient here.
+args.each_option do |name, value|
+  case name
+  when '--query'
+    $opts[:query] = value
+  when '--target'
+    $opts[:target] = value
+  when '--command'
+    $opts[:command] = value
+  when '--sge_opts'
+    $opts[:sge_opts] = value
+  when '--task_min'
+    $opts[:task_min] = positive_integer(name, value)
+  when '--task_max'
+    $opts[:task_max] = positive_integer(name, value)
+  when '--task_step'
+    $opts[:task_step] = positive_integer(name, value)
+  when '--clear'
+    $opts[:clear] = true
+  when '--clean'
+    $opts[:clean] = true
+  when '--distclean'
+    $opts[:clear] = true
+    $opts[:clean] = true
+  when '--help'
+    $opts[:help] = true
+  end
+end
+# Help is handled before anything destructive, so that combining -h with
+# --clear/--clean/--distclean never deletes files.
+show_usage if $opts[:help]
+if $opts[:clear] || $opts[:clean]
+  cleaner = Bio::SGE.new
+  cleaner.clear if $opts[:clear]
+  cleaner.clean if $opts[:clean]
+  # A pure clean-up call (e.g. "biosge.rb --distclean") is finished here;
+  # with a command given, the job is prepared and submitted again below.
+  exit unless $opts[:command]
+end
+show_usage unless $opts[:command]
+sge = Bio::SGE.new { |opt|
+  opt.query     = $opts[:query]     if $opts[:query]
+  opt.target    = $opts[:target]    if $opts[:target]
+  opt.command   = $opts[:command]   if $opts[:command]
+  opt.sge_opts  = $opts[:sge_opts]  if $opts[:sge_opts]
+  opt.task_min  = $opts[:task_min]  if $opts[:task_min]
+  opt.task_max  = $opts[:task_max]  if $opts[:task_max]
+  opt.task_step = $opts[:task_step] if $opts[:task_step]
+}
+sge.prepare
+sge.submit
data/bio-sge.gemspec ADDED

@@ -0,0 +1,70 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+#
+# Gem specification of bio-sge 0.0.0: package metadata, the list of
+# packaged files, the executable (bin/biosge.rb) and the dependencies.
+Gem::Specification.new do |s|
+  s.name = %q{bio-sge}
+  s.version = "0.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Toshiaki Katayama"]
+  s.date = %q{2010-12-24}
+  s.default_executable = %q{biosge.rb}
+  s.description = %q{Entries in a flatfile will be parased by the BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Huge amount of queries are automatically splitted into subdirectories. With a specified command line to be executed, queries are submited to the SGE as an array job.}
+  s.email = %q{k@bioruby.org}
+  s.executables = ["biosge.rb"]
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.rdoc"
+  ]
+  s.files = [
+    ".document",
+    "Gemfile",
+    "Gemfile.lock",
+    "LICENSE.txt",
+    "README.rdoc",
+    "Rakefile",
+    "VERSION",
+    "bin/biosge.rb",
+    "bio-sge.gemspec",
+    "lib/bio-sge.rb",
+    "test/helper.rb",
+    "test/test_bio-sge.rb"
+  ]
+  s.homepage = %q{http://github.com/ktym/bioruby-sge}
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.7}
+  s.summary = %q{BioRuby plugin for Sun Grid Engine}
+  s.test_files = [
+    "test/helper.rb",
+    "test/test_bio-sge.rb"
+  ]
+  # The same five dependencies are declared in each of the three branches
+  # below; only the declaration method differs (add_development_dependency
+  # when the running RubyGems is >= 1.2.0, plain add_dependency otherwise).
+  # NOTE(review): "bio" is declared as a development dependency, but
+  # lib/bio-sge.rb requires 'bio' at load time -- it looks like it should be
+  # a runtime dependency; confirm and fix at the source (Gemfile/Rakefile),
+  # not here.
+  if s.respond_to? :specification_version then
+    # NOTE(review): current_version is assigned but not used in this file.
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<shoulda>, [">= 0"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_development_dependency(%q<rcov>, [">= 0"])
+      s.add_development_dependency(%q<bio>, [">= 1.4.1"])
+    else
+      s.add_dependency(%q<shoulda>, [">= 0"])
+      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_dependency(%q<rcov>, [">= 0"])
+      s.add_dependency(%q<bio>, [">= 1.4.1"])
+    end
+  else
+    s.add_dependency(%q<shoulda>, [">= 0"])
+    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+    s.add_dependency(%q<rcov>, [">= 0"])
+    s.add_dependency(%q<bio>, [">= 1.4.1"])
+  end
+end

data/lib/bio-sge.rb ADDED

@@ -0,0 +1,475 @@
+#
+# = Bio::SGE -- Sun Grid Engine array job submitter (Bio::FlatFile query to SGE)
+#
+# Copyright::	Copyright (C) 2009, 2010 Toshiaki Katayama <mailto:ktym at hgc dot jp>
+# License::	Distributes under the same terms as Ruby
+# Site::	http://kanehisa.hgc.jp/~k/sge/
+# Download::	http://kanehisa.hgc.jp/~k/sge/sge.rb
+#
+# == USAGE (AS A LIBRARY)
+#
+# The Bio::SGE class extracts entries in a biological flatfile as queries
+# and executes a bulk submission to the Sun Grid Engine as an array job.
+#
+# This class takes a flatfile (e.g. multi FASTA file) as a 'query',
+# a database file as a 'target', and a command line to be executed
+# as a 'command' (see also SCRIPT VARIABLES section).
+#
+# The flatfile must be accepted by the Bio::FlatFile.auto class method
+# of the BioRuby (http://bioruby.org/) package.
+#
+# Instantiation of the Bio::SGE object can be done by
+#
+#   sge = Bio::SGE.new(query, target, command, sge_opts)
+#
+# or by assigning these values through accessors prior to a job submission
+#
+#   sge = Bio::SGE.new
+#   sge.query = 'flat_file'
+#   sge.target = 'target_database_file'
+#   sge.command = 'command --to_be_executed --with_opts'
+#
+# or by assigning these values with a block parameter.
+#
+#   sge = Bio::SGE.new { |opt|
+#     opt.query = 'flat_file'
+#     opt.target = 'target_database_file'
+#     opt.command = 'command --to_be_executed --with_opts'
+#   }
+#
+# Then, the "prepare" method will
+#
+# * create output directories
+# * generate a SGE script to be submitted
+# * extract each entry in the query as separate files
+#   (files are numbered by the order of appearance)
+#
+# and now you can submit your SGE job by the "submit" method.
+#
+#   sge.prepare
+#   sge.submit
+#
+# The "submit" method will automatically take care of messy tasks such as
+# (1) splitting array jobs according to the total number of jobs, and (2) saving
+# stdout and stderr from the SGE system to a separate log directory.
+#
+# == RESULTS
+#
+# The execution results will be stored in the following files and directories.
+#
+#   count.txt     # correspondence table of the file numbers and entry IDs
+#   input/        # extracted sequence files (one file, one sequence)
+#   output/       # outputs of the command (numbered same as the input files)
+#   error/        # errors of the command (numbered same as the input files)
+#   log/          # log files of the qsub run (stdout and stderr)
+#
+# You can confirm whether there were no system errors during the SGE execution
+# by sizes and contents of files in the log/ directory.
+#
+# Then, check the error/ directory whether there was a problem or not in your
+# jobs (some command may utilize the stderr to another purpose).
+#
+# Finally, main results can be obtained from files in the output/ directory.
+#
+# == ADVANCED USAGE
+#
+# You can individually call following methods instead of the "prepare" method.
+#
+#   sge.setup     # to prepare output directories
+#   sge.script    # to generate a SGE script
+#   sge.extract   # to extract each entry
+#
+# Therefore, if you want to reuse the sequence files already extracted to
+# the input directory, just comment out the line calling the "prepare" method
+# (and also avoid using the "extract" method, of course).
+#
+#   #sge.prepare  # comment out this line in your script
+#   sge.script
+#   sge.setup
+#   #sge.extract  # don't use this as well
+#
+#   sge.submit    # then submit
+#
+# Reversely, you can also clean up the working directory (e.g. to remove
+# test or previous execution results) by the following methods.
+#
+#   sge.clear     # to remove a SGE script and output/error/log directories
+#   sge.clean     # to remove a count file and the extracted input directory
+#   sge.distclean # to remove all of the above
+#
+# == SGE OPTIONS
+#
+# You can specify the "-t start-last:step" range values for an array job
+# by following accessors (these are optional; see EXAMPLES section below).
+#
+#   sge.task_min  # start value (default is 1)
+#   sge.task_max  # last value (default is a total number of entries in query)
+#   sge.task_step # number of processes per one job (default is 1000)
+#   sge.sge_opts  # additional options for the qsub command
+#
+# For example, if you only need to calculate on sequences starting from 8421st
+# up to 9064th, and want to invoke 100 processes per qsub execution, you
+# can specify them by the following way.
+#
+#   sge.task_min = 8421
+#   sge.task_max = 9064
+#   sge.task_step = 100
+#
+#   sge.submit
+#
+# == OVER ALL SKELETON
+#
+#   #!/usr/bin/env ruby
+#
+#   require 'bio-sge'
+#
+#   sge = Bio::SGE.new { |opt|
+#     opt.query = 'flat_file'
+#     opt.target = 'target_database_file'
+#     opt.command = 'command --to_be_executed --with_opts'
+#     opt.sge_opts = '-l cpu_arch=xeon'
+#     opt.task_min = 8421
+#     opt.task_max = 9064
+#     opt.task_step = 100
+#   }
+#   sge.clear           # included in sge.distclean
+#   sge.clean		# included in sge.distclean
+#   sge.script          # included in sge.prepare
+#   sge.setup           # included in sge.prepare
+#   sge.extract         # included in sge.prepare
+#   sge.submit
+#
+# == SCRIPT VARIABLES
+#
+# In the 'command' specification, you can use following identifiers as variables.
+#
+#   '#{query}'       fragmented query file name (== input_file)
+#   '#{target}'      target database file name
+#   '#{work_dir}'    current working directory
+#
+#   '#{task_id}'     SGE_TASK_ID
+#   '#{slice}'       -- (task_id - 1) / @@slice + 1 (integer >= 1)
+#   '#{input_file}'  -- 'input/#{slice}/#{task_id}'
+#   '#{output_file}' -- 'output/#{slice}/#{task_id}'
+#   '#{error_file}'  -- 'error/#{slice}/#{task_id}'
+#
+# Note that these identifiers must be kept in 'single quotes' to avoid variable
+# expansion before the script generation (see the EXAMPLES section below).
+#
+# == EXAMPLES
+#
+# 1. Exonerate (Query = Multi fasta protein sequences; Target = Genomic DNA)
+#
+#     #!/usr/bin/env ruby
+#
+#     require 'bio-sge'
+#
+#     sge = Bio::SGE.new { |opt|
+#       opt.query = 'd.melanogaster.pep'
+#       opt.target = 'genomic_scaffolds'
+#       opt.command = 'exonerate --bestn 1 --model protein2genome --showtargetgff 1 --showvulgar yes #{query} #{target}'
+#       opt.sge_opts = '-l cpu_arch=xeon'
+#     }
+#     sge.prepare
+#     sge.submit
+#
+#
+# 2. BLAST (Query = Multi fasta; Target = BLAST DB)
+#
+#     #!/usr/bin/env ruby
+#
+#     require 'bio-sge'
+#
+#     sge = Bio::SGE.new { |opt|
+#       opt.query = 'query.pep'
+#       opt.target = 'target.pep'
+#       opt.command = 'blastall -p blastp -i #{query} -d #{target}'
+#       opt.sge_opts = '-l cpu_arch=xeon'
+#     }
+#     sge.prepare
+#     sge.submit
+#
+#
+# 3. HMMER (Query = Multi fasta protein sequences; Target = Pfam DB)
+#
+#     #!/usr/bin/env ruby
+#
+#     require 'bio-sge'
+#
+#     sge = Bio::SGE.new { |opt|
+#       opt.query = 'data/h.sapiens.pep'
+#       opt.target = 'db/Pfam_ls'
+#       opt.command = 'hmmscan --tblout output/#{slice}/#{task_id}.tbl #{target} #{query}'
+#       opt.sge_opts = '-l cpu_arch=xeon'
+#     }
+#     sge.prepare
+#     sge.submit
+#
+# 4. RefSeq to GFF (Query = RefSeq entries in GenBank format)
+#
+#     #!/usr/bin/env ruby
+#
+#     require 'bio-sge'
+#
+#     sge = Bio::SGE.new { |opt|
+#       opt.query = 'invertebrate6.genomic.gbff'
+#       opt.command = 'bp_genbank2gff3.pl -out stdout #{query}'
+#     }
+#     sge.prepare
+#     sge.submit
+#
+# == CHANGE LOG
+#
+# === 2010/12/24 v2.5
+#
+# * released as a BioRuby plugin
+#
+# === 2010/09/11 v2.4
+#
+# * changed to skip "extract" when "count.txt" file exists, so that
+#   user can easily re-submit the job (with different parameter or fix)
+#   just after the --clear. To extract again (starting from scratch),
+#   use --clean (with --clear) or --distclean first.
+# * doc fix
+#
+# === 2010/05/21 v2.3
+#
+# * slice is changed to start from 1 (instead of 0) and have 1000 files
+#   per directory (instead of 10000).
+#
+# === 2010/03/25 v2.2
+#
+# * doc fix
+#
+# === 2009/12/10 v2.1
+#
+# * --clear, --clean, --distclean options are supported.
+#
+# === 2009/12/07 v2.0
+#
+# * Extended to be used as a command.
+#
+# === 2009/11/13 v1.3
+#
+# * SGE class is moved under the Bio name space (Bio::SGE) as it tightly
+#   depends on the Bio::FlatFile (in BioRuby).
+# * Bio::SGE is improved to accept options as a block parameter as well.
+#
+# === 2009/11/02 v1.2
+#
+# * slice functionality is fixed to properly create slice directories
+#   under the output and error directories
+#
+# === 2009/09/29 v1.1
+#
+# * slice (sub directory to suppress the number of files in a single directory)
+#   is introduced not to overload file server (MDS)
+# * fixed document
+#
+# === 2009/09/29 v1.0
+#
+# * SGE_TASK_LAST is introduced to avoid remainder jobs are submitted
+# * documentation is rewritten in RDoc format
+# * web site is opened and released to public
+#
+# === 2009/09/29 v0.3
+#
+# * SGE_TASK_STEPSIZE is introduced not to overload the SGE manager
+#   by a bunch of short time jobs
+#
+# === 2009/09/23 v0.2
+#
+# * query/target variables are introduced to allow commands having
+#   BLAST-like options for specifying query and target files
+# * added documentation
+#
+# === 2009/09/17 v0.1
+#
+# * implemented FASTA file extraction and qsub submission functionality
+#
+require 'rubygems'
+require 'fileutils'
+require 'bio'
+module Bio
+# Extracts the entries of a flatfile into numbered query files and runs a
+# user-specified command over them on the Sun Grid Engine as array jobs.
+#
+# Typical use is Bio::SGE.new { |opt| ... } followed by #prepare and #submit.
+class SGE
+  # Number of files per directory
+  @@slice = 1000
+  # Template string for script generation.  The percent-delimited
+  # placeholders are filled in by #script; the rest is written verbatim to
+  # the SGE script, which is itself a Ruby program executed once per array
+  # task.
+  @@template = <<'END'
+#$ -S /usr/local/bin/ruby
+require 'fileutils'
+work_dir = "%WORK_DIR%"
+offset = ENV["SGE_TASK_ID"].to_i
+limit  = ENV["SGE_TASK_STEPSIZE"].to_i
+last   = ENV["SGE_TASK_LAST"].to_i
+slice = slice_old = nil
+# This array task owns the ids offset .. offset + limit - 1; the next array
+# task starts at offset + limit, so the upper bound must be exclusive of it.
+offset.upto(offset + limit - 1) do |task_id|
+  break if task_id > last
+  slice_old = slice
+  slice = (task_id - 1) / %SLICE% + 1
+  output_dir = "%OUTPUT_DIR%/#{slice}"
+  error_dir = "%ERROR_DIR%/#{slice}"
+  if slice_old != slice
+    # mkdir_p does not fail when another array task creates the same
+    # directory at the same time
+    FileUtils.mkdir_p(output_dir)
+    FileUtils.mkdir_p(error_dir)
+  end
+  input_file  = "%INPUT_DIR%/#{slice}/#{task_id}"
+  output_file = "%OUTPUT_DIR%/#{slice}/#{task_id}"
+  error_file  = "%ERROR_DIR%/#{slice}/#{task_id}"
+  query = input_file
+  target = "%TARGET%"
+  if File.exist?(query)
+    system("%COMMAND% > #{output_file} 2> #{error_file}")
+  end
+end
+END
+  # query:: flatfile with the entries to be used as queries
+  # target:: database file handed to the command as #{target}
+  # command:: command line to be executed for each entry
+  # sge_opts:: additional options for qsub
+  # count:: number of the last entry read by #extract (or taken from the count file)
+  attr_accessor :query, :target, :command, :sge_opts, :count
+  # Range of the array job ("-t task_min-task_max:task_step")
+  attr_accessor :task_min, :task_max, :task_step
+  # Working directory and the log/input/output/error directory names
+  attr_accessor :work_dir, :log_dir, :input_dir, :output_dir, :error_dir
+  # Creates a new submitter.
+  #
+  # +query+ and +target+ given here are resolved against the current
+  # working directory; absolute paths are kept as they are (previously they
+  # were blindly prefixed with the working directory) and omitted values
+  # stay nil (previously they became "<work_dir>/").  Values assigned
+  # through the accessors or the block are stored unchanged.
+  #
+  # When a block is given, the new object is yielded to it so that the
+  # options can be assigned as opt.query = ... etc.
+  def initialize(query = nil, target = nil, command = nil, sge_opts = nil)
+    @work_dir = Dir.pwd
+    @query = query && File.expand_path(query, @work_dir)
+    @target = target && File.expand_path(target, @work_dir)
+    @command = command
+    @sge_opts = sge_opts
+    yield(self) if block_given?
+    # directory and file names are assigned after the block, as before, so
+    # they are always these fixed values right after construction
+    @log_dir = "log"
+    @input_dir = "input"
+    @output_dir = "output"
+    @error_dir = "error"
+    @script_file = "script.rb"
+    @count_file = "count.txt"
+  end
+  # Runs #setup, #script and #extract in this order.
+  def prepare
+    setup
+    script
+    extract
+  end
+  # Submits the generated script with qsub as one or more array jobs.
+  #
+  # The range task_min..task_max (defaults: 1 and the entry count, which is
+  # read from the last line of the count file unless #extract already set
+  # it) is cut into chunks of at most 50000 task ids, one qsub call each.
+  # Consecutive chunks no longer share their boundary id, which used to be
+  # submitted twice.  An empty count file is treated as zero entries.
+  def submit
+    unless @count
+      $stderr.puts "Reading #{@count_file} ..."
+      last_line = File.readlines(@count_file).last
+      @count = last_line ? last_line[/^\d+/].to_i : 0
+      $stderr.puts "done."
+    end
+    task_min = @task_min || 1
+    task_max = @task_max || @count
+    task_step = @task_step || 1000
+    # system upper limit is 75000
+    limit = 50000
+    task_min.step(task_max, limit) do |offset|
+      # inclusive upper bound of this chunk; the next chunk starts at
+      # offset + limit
+      chunk_last = [offset + limit - 1, task_max].min
+      opts = "#{@sge_opts} -o #{@log_dir} -e #{@log_dir} -cwd"
+      span = "-t #{offset}-#{chunk_last}:#{task_step}"
+      qsub = "qsub #{opts} #{span} #{@script_file}"
+      $stderr.puts "Submitting ... #{qsub}"
+      system(qsub)
+    end
+  end
+  # Removes +file+ (a file or a directory tree), reporting progress on stderr.
+  def rmtree(file)
+    $stderr.print "Deleting #{file} ... "
+    FileUtils.rmtree(file)
+    $stderr.puts "done."
+  end
+  # Removes the SGE script and the output/error/log directories.
+  def clear
+    rmtree(@script_file)
+    rmtree(@output_dir)
+    rmtree(@error_dir)
+    rmtree(@log_dir)
+  end
+  # Removes the count file and the extracted input directory.
+  def clean
+    rmtree(@count_file)
+    rmtree(@input_dir)
+  end
+  # Runs both #clear and #clean.
+  def distclean
+    clear
+    clean
+  end
+  # Creates the directory +dir+ (with parents) unless it already exists,
+  # reporting progress on stderr.
+  def mkpath(dir)
+    $stderr.print "Creating #{dir} ... "
+    if File.directory?(dir)
+      $stderr.puts "skip (already exists)."
+    else
+      FileUtils.mkpath(dir)
+      $stderr.puts "done."
+    end
+  end
+  # Creates the log, input, output and error directories.
+  def setup
+    mkpath(@log_dir)
+    mkpath(@input_dir)
+    mkpath(@output_dir)
+    mkpath(@error_dir)
+  end
+  # Writes the SGE script by filling the placeholders of the template.
+  #
+  # Raises ArgumentError when no command has been specified (previously a
+  # TypeError was raised from inside gsub!).
+  #
+  # NOTE: the command is embedded in a double-quoted string of the generated
+  # Ruby script; that is what makes '#{query}' etc. work, and it also means
+  # that a double quote inside the command has to be escaped by the caller.
+  def script
+    raise ArgumentError, "command is not specified" unless @command
+    # kept as an ordered list: the substitutions are applied one after
+    # another in the same order as before
+    substitutions = [
+      ['%WORK_DIR%',   @work_dir],
+      ['%INPUT_DIR%',  @input_dir],
+      ['%OUTPUT_DIR%', @output_dir],
+      ['%ERROR_DIR%',  @error_dir],
+      ['%TARGET%',     @target],
+      ['%COMMAND%',    @command],
+      ['%SLICE%',      @@slice]
+    ]
+    sge_script = @@template.dup
+    substitutions.each do |placeholder, value|
+      # block form: the value is inserted literally, whereas a replacement
+      # string would have its backslash sequences interpreted by gsub!
+      sge_script.gsub!(placeholder) { value.to_s }
+    end
+    File.open(@script_file, "w") do |file|
+      file.puts sge_script
+    end
+  end
+  # Extracts each entry of the query flatfile into
+  # "<input_dir>/<slice>/<number>" and records "<number>\t<entry_id>" in the
+  # count file.  Entries are numbered from 1 in order of appearance; entries
+  # outside task_min..task_max (when set) are counted but not written.
+  #
+  # Does nothing when the count file already exists, so that a job can be
+  # re-submitted without extracting again.  Raises ArgumentError when no
+  # query has been specified.
+  def extract
+    return if File.exist?(@count_file)
+    raise ArgumentError, "query is not specified" unless @query
+    slice = slice_old = nil
+    @count = 0
+    File.open(@count_file, "a") do |count_file|
+      Bio::FlatFile.auto(@query) do |ff|
+        ff.each do |entry|
+          @count += 1
+          $stderr.print "Extracting ... #{@count} (#{entry.entry_id}) "
+          if (@task_min && @count < @task_min) || (@task_max && @count > @task_max)
+            $stderr.puts "skip."
+            next
+          else
+            slice_old = slice
+            slice = (@count - 1) / @@slice + 1
+            slice_dir = "#{@input_dir}/#{slice}"
+            # a new slice directory is needed only when the slice changes
+            mkpath(slice_dir) if slice_old != slice
+            File.open("#{slice_dir}/#{@count}", "w") do |file|
+              file.puts ff.entry_raw
+            end
+            count_file.puts [@count, entry.entry_id].join("\t")
+            $stderr.puts "done."
+          end
+        end
+      end
+    end
+  end
+end # class SGE
+end # module Bio

data/test/helper.rb ADDED

@@ -0,0 +1,18 @@
+require 'rubygems'
+require 'bundler'
+# Resolve the bundle (default and development groups) up front; when a gem
+# is missing, report it on stderr and exit with Bundler's own status code.
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => bundler_error
+  $stderr.puts bundler_error.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit bundler_error.status_code
+end
+require 'test/unit'
+require 'shoulda'
+# Put test/ first and lib/ second on the load path so that both 'helper'
+# and 'bio-sge' can be required by their bare names.
+test_dir = File.dirname(__FILE__)
+lib_dir  = File.join(test_dir, '..', 'lib')
+$LOAD_PATH.unshift(test_dir, lib_dir)
+require 'bio-sge'
+# Reopened so that helpers shared by all test cases can be added here
+# (currently empty).
+class Test::Unit::TestCase
+end

data/test/test_bio-sge.rb ADDED

@@ -0,0 +1,7 @@
+require 'helper'
+# Tests for the parts of Bio::SGE that need neither an SGE installation nor
+# input data: option handling in the constructor.  (Replaces the generated
+# placeholder test, which called flunk unconditionally and therefore made
+# the default rake task fail.)
+class TestBioSge < Test::Unit::TestCase
+  context "Bio::SGE.new" do
+    should "use the current directory and the fixed directory names" do
+      sge = Bio::SGE.new
+      assert_equal Dir.pwd,  sge.work_dir
+      assert_equal "log",    sge.log_dir
+      assert_equal "input",  sge.input_dir
+      assert_equal "output", sge.output_dir
+      assert_equal "error",  sge.error_dir
+    end
+    should "accept positional arguments" do
+      sge = Bio::SGE.new('query.fa', 'target.fa', 'cat #{query}', '-l cpu_arch=xeon')
+      assert_equal File.join(Dir.pwd, 'query.fa'),  sge.query
+      assert_equal File.join(Dir.pwd, 'target.fa'), sge.target
+      assert_equal 'cat #{query}',                  sge.command
+      assert_equal '-l cpu_arch=xeon',              sge.sge_opts
+    end
+    should "accept options through a block" do
+      sge = Bio::SGE.new { |opt|
+        opt.command   = 'true'
+        opt.task_min  = 5
+        opt.task_max  = 9
+        opt.task_step = 2
+      }
+      assert_equal 'true', sge.command
+      assert_equal 5,      sge.task_min
+      assert_equal 9,      sge.task_max
+      assert_equal 2,      sge.task_step
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,155 @@
+--- !ruby/object:Gem::Specification
+name: bio-sge
+version: !ruby/object:Gem::Version
+  hash: 31
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 0
+  version: 0.0.0
+platform: ruby
+authors:
+- Toshiaki Katayama
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-12-24 00:00:00 +09:00
+default_executable: biosge.rb
+dependencies:
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  name: shoulda
+  version_requirements: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id001
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  name: bundler
+  version_requirements: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  requirement: *id002
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  name: jeweler
+  version_requirements: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 1
+        - 5
+        - 2
+        version: 1.5.2
+  requirement: *id003
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  name: rcov
+  version_requirements: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id004
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  name: bio
+  version_requirements: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 5
+        segments:
+        - 1
+        - 4
+        - 1
+        version: 1.4.1
+  requirement: *id005
+description: Entries in a flatfile will be parased by the BioRuby's Bio::FlatFile.auto module. These entries are used as queries for the Sun Grid Engine (SGE) system. Huge amount of queries are automatically splitted into subdirectories. With a specified command line to be executed, queries are submited to the SGE as an array job.
+email: k@bioruby.org
+executables:
+- biosge.rb
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- bin/biosge.rb
+- bio-sge.gemspec
+- lib/bio-sge.rb
+- test/helper.rb
+- test/test_bio-sge.rb
+has_rdoc: true
+homepage: http://github.com/ktym/bioruby-sge
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: BioRuby plugin for Sun Grid Engine
+test_files:
+- test/helper.rb
+- test/test_bio-sge.rb