full_lengther_next 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
bin/download_fln_dbs.rb
|
2
|
+
bin/make_user_db.rb
|
3
|
+
bin/full_lengther_next
|
4
|
+
History.txt
|
5
|
+
lib/full_lengther_next/classes/common_functions.rb
|
6
|
+
lib/full_lengther_next/classes/fl2_stats.rb
|
7
|
+
lib/full_lengther_next/classes/fl_analysis.rb
|
8
|
+
lib/full_lengther_next/classes/fl_string_utils.rb
|
9
|
+
lib/full_lengther_next/classes/lcs.rb
|
10
|
+
lib/full_lengther_next/classes/my_worker.rb
|
11
|
+
lib/full_lengther_next/classes/my_worker_manager.rb
|
12
|
+
lib/full_lengther_next/classes/orf.rb
|
13
|
+
lib/full_lengther_next/classes/sequence.rb
|
14
|
+
lib/full_lengther_next/classes/test_code.rb
|
15
|
+
lib/full_lengther_next/classes/une_los_hit.rb
|
16
|
+
lib/full_lengther_next.rb
|
17
|
+
Manifest.txt
|
18
|
+
PostInstall.txt
|
19
|
+
Rakefile
|
20
|
+
README.rdoc
|
21
|
+
script
|
22
|
+
script/console
|
23
|
+
script/destroy
|
24
|
+
script/generate
|
25
|
+
test
|
26
|
+
test/test_full_lengther_next.rb
|
27
|
+
test/test_helper.rb
|
data/PostInstall.txt
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
= full_lengther_next
|
2
|
+
|
3
|
+
* http://www.scbi.uma.es/downloads
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time. It is able to classify unigenes as full-length, 5’-end, 3’-end or internal, suggesting which unknown genes are coding and which are not. FULL-LENGTHERNEXT also fixes frame shifts, one of the main mistakes found in wrong entries of full-length sequence databases, and it is a fast tool to compare different transcriptome assemblies.
|
8
|
+
|
9
|
+
== FEATURES/PROBLEMS:
|
10
|
+
|
11
|
+
* FULL-LENGTHERNEXT uses scbi_mapreduce and thus is able to exploit all the benefits of a cluster environment. It also works on multi-core machines and big shared-memory servers.
|
12
|
+
|
13
|
+
* It is able to classify unigenes to full-length, 5’-end, 3’-end and internal.
|
14
|
+
|
15
|
+
* FULL-LENGTHERNEXT fixes frame shifts.
|
16
|
+
|
17
|
+
* It returns the translated protein sequence for the complete genes and the nucleotide sequence with frame shift fixed and highlighting the start and end codon for an easier finding of the gene and the UTR regions.
|
18
|
+
|
19
|
+
* FULL-LENGTHERNEXT suggests putative new genes by analysing which of the genes classified as unknown are probably coding.
|
20
|
+
|
21
|
+
* It produces a stats file useful for assemblies comparison.
|
22
|
+
|
23
|
+
== SYNOPSIS:
|
24
|
+
|
25
|
+
FULL-LENGTHERNEXT must be fed with a multifasta file containing all the unigenes to analyse, and with the group to which the organism under study belongs (fungi, human, invertebrates, mammals, plants, rodents or vertebrates), so that the most appropriate databases are used. Furthermore, it is possible to set the number of CPUs to be used (workers), the minimum identity percent (default = 45%) and the e value (default = 1e-25) thresholds, the maximum distance between query and subject gene limits (default = 15 amino acids) and, if desired, a user database of complete proteins.
|
26
|
+
|
27
|
+
full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] -d user_db [options]
|
28
|
+
|
29
|
+
|
30
|
+
=== PBS Submission script
|
31
|
+
|
32
|
+
$> cat sample_work.sh
|
33
|
+
|
34
|
+
# 12 distributed workers and 1 GB memory per worker:
|
35
|
+
#PBS -l select=12:ncpus=1:mpiprocs=1:mem=1gb
|
36
|
+
# request 10 hours of walltime:
|
37
|
+
#PBS -l walltime=10:00:00
|
38
|
+
# cd to working directory (from where job was submitted)
|
39
|
+
cd $PBS_O_WORKDIR
|
40
|
+
|
41
|
+
# create workers file with assigned node names
|
42
|
+
|
43
|
+
cat ${PBS_NODEFILE} > workers
|
44
|
+
|
45
|
+
# init seqtrimnext
|
46
|
+
source ~seqtrimnext/init_env
|
47
|
+
|
48
|
+
time full_lengther_next -f input.fasta -g plants -w workers -s 10.0.0
|
49
|
+
Once this submission script is created, you only need to launch it with:
|
50
|
+
|
51
|
+
qsub sample_work.sh
|
52
|
+
|
53
|
+
== REQUIREMENTS:
|
54
|
+
|
55
|
+
Ruby 1.9.2
|
56
|
+
|
57
|
+
Blast plus 2.2.24 or greater (prior versions have bugs that produce bad results)
|
58
|
+
|
59
|
+
== INSTALL:
|
60
|
+
|
61
|
+
=== Installing Blast
|
62
|
+
|
63
|
+
*Download the latest version of Blast+ from ftp.ncbi.nlm.nih.gov/blast/executables/release/LATEST/
|
64
|
+
*You can also use a precompiled version if you like
|
65
|
+
*To install from source, decompress the downloaded file, cd to the decompressed folder, and issue the following commands:
|
66
|
+
|
67
|
+
./configure
|
68
|
+
make
|
69
|
+
sudo make install
|
70
|
+
|
71
|
+
=== Installing Ruby 1.9
|
72
|
+
|
73
|
+
*You can use RVM to install ruby:
|
74
|
+
|
75
|
+
Download latest certificates (maybe you don’t need them):
|
76
|
+
|
77
|
+
$ curl -O http://curl.haxx.se/ca/cacert.pem
|
78
|
+
$ export CURL_CA_BUNDLE=`pwd`/cacert.pem # add this to your .bashrc or
|
79
|
+
equivalent
|
80
|
+
|
81
|
+
Install RVM:
|
82
|
+
|
83
|
+
$ bash < <(curl -k https://rvm.beginrescueend.com/install/rvm)
|
84
|
+
Setup environment:
|
85
|
+
|
86
|
+
$ echo '[[ -s "$HOME/.rvm/scripts/rvm" ]] && . "$HOME/.rvm/scripts/rvm" # Load RVM function' >> ~/.bash_profile
|
87
|
+
Install ruby 1.9.2 (this can take a while):
|
88
|
+
|
89
|
+
$ rvm install 1.9.2
|
90
|
+
Set it as the default:
|
91
|
+
|
92
|
+
$ rvm use 1.9.2 --default
|
93
|
+
|
94
|
+
|
95
|
+
=== Install Full-LengtherNEXT
|
96
|
+
|
97
|
+
Full-LengtherNEXT is very easy to install. It is distributed as a ruby gem. The next command will install Full-LengtherNEXT and all the required gems:
|
98
|
+
|
99
|
+
gem install full_lengther_next
|
100
|
+
|
101
|
+
|
102
|
+
=== Install and rebuild Full-LengtherNEXT databases
|
103
|
+
|
104
|
+
Full-LengtherNEXT needs some databases to work. You can use the BLASTDB environment variable to change the default database location. To install them, execute:
|
105
|
+
|
106
|
+
$ download_fln_dbs.rb
|
107
|
+
|
108
|
+
==== User database
|
109
|
+
|
110
|
+
In addition, Full-LengtherNEXT is able to use a customised database. It can be created by executing:
|
111
|
+
|
112
|
+
$ make_user_db.rb
|
113
|
+
|
114
|
+
This script only needs two parameters: a database division among fungi, human, invertebrates, mammals, plants, rodents or vertebrates, and a ‘taxon’ that corresponds to a specific taxonomic group such as a genus, family or order.
|
115
|
+
For example, if our organism under study is a pine, the database division will be ‘plants’ and the taxon may be Pinus, Pinaceae or even the Coniferales order, which includes pines and the other conifers. Therefore, the command line will be:
|
116
|
+
|
117
|
+
$ make_user_db.rb plants Coniferales
|
118
|
+
|
119
|
+
Otherwise, this database must contain only complete proteins and be formatted with the BLAST command:
|
120
|
+
|
121
|
+
makeblastdb -in sequences.fasta -dbtype 'prot' -parse_seqids
|
122
|
+
|
123
|
+
|
124
|
+
== LICENSE:
|
125
|
+
|
126
|
+
(The MIT License)
|
127
|
+
|
128
|
+
Copyright (c) 2012 - Noe Fernandez & Dario Guerrero, noeisneo@gmail.com & dariogf@gmail.com
|
129
|
+
|
130
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
131
|
+
a copy of this software and associated documentation files (the
|
132
|
+
'Software'), to deal in the Software without restriction, including
|
133
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
134
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
135
|
+
permit persons to whom the Software is furnished to do so, subject to
|
136
|
+
the following conditions:
|
137
|
+
|
138
|
+
The above copyright notice and this permission notice shall be
|
139
|
+
included in all copies or substantial portions of the Software.
|
140
|
+
|
141
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
142
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
143
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
144
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
145
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
146
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
147
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
gem 'hoe', '>= 2.1.0'
|
3
|
+
require 'hoe'
|
4
|
+
require 'fileutils'
|
5
|
+
require './lib/full_lengther_next'
|
6
|
+
|
7
|
+
Hoe.plugin :newgem
# Hoe.plugin :website
# Hoe.plugin :cucumberfeatures

# Gem specification. Hoe generates all the Rake tasks from it;
# run 'rake -T' from the gem root directory to list them.
$hoe = Hoe.spec 'full_lengther_next' do
  self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
  self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
  self.rubyforge_name = self.name # TODO this is default value

  # Runtime dependencies of the gem.
  # Previously considered but currently disabled: activesupport, narray,
  # gnuplot, term-ansicolor, scbi_fastq and scbi_math.
  self.extra_deps = [
    ['xml-simple', '>=1.0.12'],
    ['scbi_blast', '>=0.0.32'],
    ['scbi_mapreduce', '>=0.0.29'],
    ['scbi_fasta', '>=0.1.7'],
    ['scbi_plot', '>=0.0.6']
  ]
end

require 'newgem/tasks'

# Load any project-specific rake tasks.
Dir.glob('tasks/**/*.rake').each { |rake_file| load rake_file }
|
34
|
+
|
35
|
+
# TODO - want other tests/tasks run by default? Add them to the list
|
36
|
+
# remove_task :default
|
37
|
+
# task :default => [:spec, :features]
|
@@ -0,0 +1,197 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# 15-2-2011 Noe Fernandez-Pozo
|
4
|
+
# Script to download Full-LengtherNext databases.
|
5
|
+
# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
|
6
|
+
|
7
|
+
require 'net/ftp'
|
8
|
+
|
9
|
+
################################################### Functions
|
10
|
+
|
11
|
+
# Connects anonymously to the UniProt FTP server and downloads into
# formatted_db_path the taxonomic-division files of every requested group,
# plus the Swiss-Prot isoform file (uniprot_sprot_varsplic.fasta.gz).
#
# my_array          - Array of UniProt taxonomic division names (e.g. "plants").
# formatted_db_path - Directory where the downloaded files are written.
#
# The connection is stored in the global $ftp because download_uniprot reads
# it from there. Network errors raised by Net::FTP are not rescued here.
def conecta_uniprot(my_array, formatted_db_path)
  # The downloads are written to formatted_db_path, so that is the directory
  # that has to exist before the first transfer.
  Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)

  $ftp = Net::FTP.new
  $ftp.connect('ftp.uniprot.org')

  begin
    $ftp.login
    puts "connected to UniProt"

    my_array.each do |db_group|
      puts "Downloading #{db_group}"
      download_uniprot(db_group, formatted_db_path)
    end

    $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
    $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", "#{formatted_db_path}/uniprot_sprot_varsplic.fasta.gz")

    puts "isoform files downloaded"
  ensure
    # Release the connection even when a transfer fails half-way.
    $ftp.close
  end
end
|
38
|
+
|
39
|
+
# Downloads the Swiss-Prot and TrEMBL flat files (.dat.gz) of one UniProt
# taxonomic division into formatted_db_path.
#
# uniprot_group     - Name of the taxonomic division (e.g. "plants").
# formatted_db_path - Directory where the two archives are written.
#
# Uses the FTP connection stored in the global $ftp.
def download_uniprot(uniprot_group, formatted_db_path)
  $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")

  ['sprot', 'trembl'].each do |section|
    archive = "uniprot_#{section}_#{uniprot_group}.dat.gz"
    $ftp.getbinaryfile(archive, "#{formatted_db_path}/#{archive}")
  end

  puts "#{uniprot_group} files downloaded"
end
|
48
|
+
|
49
|
+
# Extracts the complete protein sequences of a UniProt flat file (.dat) into
# a FASTA file, appending the known isoforms of every sequence written.
#
# file_name         - Path of a file named uniprot_<db>_<group>.dat, where <db>
#                     is e.g. "sprot" or "trembl".
# isoform_hash      - Hash mapping an accession to the FASTA text of its
#                     isoforms (as built by load_isoform_hash).
# formatted_db_path - Directory under which the output folder is created.
#
# The output is written to
# <formatted_db_path>/<sp|tr>_<group>/<sp|tr>_<group>.fasta
#
# A sequence is discarded when it is flagged as a fragment, has a NON_TER or
# NON_CONS feature, or does not start with methionine.
#
# Raises ArgumentError when file_name does not follow the expected pattern.
def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path)
  puts "filtering sequences from #{file_name}"

  # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
  #
  # * FT NON_TER: The residue at an extremity of the sequence is not the
  #   terminal residue. If applied to position 1, this signifies that the
  #   first position is not the N-terminus of the complete molecule. If
  #   applied to the last position, it means that this position is not the
  #   C-terminus of the complete molecule. Examples:
  #     FT NON_TER 1 1
  #     FT NON_TER 29 29
  # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a
  #   sequence are not consecutive and that there are a number of unreported
  #   or missing residues between them. Example:
  #     FT NON_CONS 1683 1684

  # Only the base name is inspected, so directory names cannot confuse the match.
  name_match = /uniprot_([a-z]+)_([a-z]+)\.dat/.match(File.basename(file_name))
  raise ArgumentError, "unexpected UniProt file name: #{file_name}" if name_match.nil?

  db_name = name_match[1].sub('sprot', 'sp').sub('trembl', 'tr')
  output_name = name_match[2]

  output_dir = "#{formatted_db_path}/#{db_name}_#{output_name}"
  Dir.mkdir(output_dir) unless File.exist?(output_dir)

  # State of the entry currently being read.
  newseq = false      # true while inside an entry (between its AC line and "//")
  print_seq = true    # false as soon as the entry is known to be incomplete
  id = ''
  description = ''
  organism_name = ''
  seq = ''
  organelle = ''

  # Block forms guarantee that both files are closed, even on errors.
  File.open("#{output_dir}/#{db_name}_#{output_name}.fasta", "w") do |output_file|
    File.open(file_name) do |input_file|
      input_file.each_line do |line|
        if !newseq
          # Wait for the accession line that opens a new entry.
          if line =~ /^AC\s+(\w+);/
            id = $1
            newseq = true
            description = ''
            organism_name = ''
            seq = ''
            print_seq = true
            organelle = ''
          end
        elsif line =~ /^DE\s+(.+)\;*/
          # Only the first DE line is kept as description. The greedy capture
          # keeps any trailing ';' in the description.
          if description == ''
            description = $1
            description.sub!(/RecName: Full=/, 'sp=')
            description.sub!(/SubName: Full=/, 'tr=')
          end
          print_seq = false if line =~ /Flags: Fragment/
        elsif line =~ /^OS\s+(.+)/
          organism_name = $1
        elsif line =~ /^OG\s+(.+)/
          organelle = $1
        elsif line =~ /^FT\s+NON_TER\s+/
          print_seq = false
        elsif line =~ /^FT\s+NON_CONS\s+(\d+)\s+/
          print_seq = false
        elsif line =~ /^\s+([\w\s]+)/
          # Sequence lines are the only ones that start with whitespace.
          seq << $1
        elsif line =~ /^\/\//
          # End of entry: decide whether it is written.
          seq.gsub!(/\s*/, '')
          print_seq = false if seq !~ /^M/i
          newseq = false

          if print_seq
            output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
            output_file.puts isoform_hash[id] unless isoform_hash[id].nil?
          end
        end
      end
    end
  end
end
|
135
|
+
|
136
|
+
# Reads a FASTA file of isoforms whose headers look like
# ">db|ACCESSION-N|rest" and groups the records by accession.
#
# file - Path of the isoform FASTA file.
#
# Returns a Hash mapping each accession (without the "-N" suffix) to a String
# holding all its isoform records, each one as "header\nsequence\n" with the
# sequence joined on a single line.
def load_isoform_hash(file)
  isoform_hash = {}
  my_fasta = nil
  acc = nil

  # Appends the record collected so far, if any, to the entry of its accession.
  store_record = lambda do
    isoform_hash[acc] = "#{isoform_hash[acc]}#{my_fasta}\n" unless acc.nil?
  end

  File.open(file) do |input|
    input.each_line do |line|
      line.chomp!
      # \d+ so that isoforms numbered 10 and above are recognised as headers too.
      if line =~ /(^>\w+\|(\w+)-\d+\|.+)/
        store_record.call
        my_fasta = "#{$1}\n"
        acc = $2
      elsif my_fasta
        # Sequence line of the current record; lines before the first header are skipped.
        my_fasta << line
      end
    end
  end

  # The last record has no following header to trigger its storage.
  store_record.call

  isoform_hash
end
|
164
|
+
################################################### MAIN
|
165
|
+
|
166
|
+
# Directory holding this script; used to locate the default database folder.
ROOT_PATH=File.dirname(__FILE__)

# Use the BLASTDB directory when it is set and exists, otherwise fall back to
# a "blast_dbs" folder next to this script.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

ENV['BLASTDB']=formatted_db_path
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"

my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"]
# my_array = ["plants","invertebrates"] # used for a shorter test

conecta_uniprot(my_array, formatted_db_path)

# Decompress every downloaded archive. Commands are run with an argument list
# instead of a shell string so that paths with spaces are handled correctly.
archives = Dir.glob("#{formatted_db_path}/*gz")
unless archives.empty?
  warn "gunzip failed in #{formatted_db_path}" unless system('gunzip', *archives)
end

isoform_hash = load_isoform_hash("#{formatted_db_path}/uniprot_sprot_varsplic.fasta")

my_array.each do |db_group|
  # Keep only complete proteins of both Swiss-Prot and TrEMBL ...
  filter_incomplete_seqs("#{formatted_db_path}/uniprot_sprot_#{db_group}.dat", isoform_hash, formatted_db_path)
  filter_incomplete_seqs("#{formatted_db_path}/uniprot_trembl_#{db_group}.dat", isoform_hash, formatted_db_path)

  # ... and format the resulting FASTA files as BLAST protein databases.
  ['sp', 'tr'].each do |prefix|
    fasta = "#{formatted_db_path}/#{prefix}_#{db_group}/#{prefix}_#{db_group}.fasta"
    unless system('makeblastdb', '-in', fasta, '-dbtype', 'prot', '-parse_seqids')
      warn "makeblastdb failed for #{fasta}"
    end
  end
end

puts "download_fln_dbs.rb has finished"
|
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# 12-2-2011 Noe Fernandez Pozo.
|
4
|
+
# Full-Lengther2 predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
|
5
|
+
|
6
|
+
#------------------------------------------------------------------ parameters entry
|
7
|
+
require 'optparse'
|
8
|
+
require 'socket'
|
9
|
+
|
10
|
+
# Command-line options; every default is set right before its switch is declared.
options = {}

# A 'logs' folder is created in the current working directory if missing.
Dir.mkdir('logs') unless File.exist?('logs')

optparse = OptionParser.new do |opts|

  options[:fasta] = nil
  opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
    options[:fasta] = file
  end

  options[:tax_group] = nil
  opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n\t\t\t\t\tfungi\n\t\t\t\t\thuman\n\t\t\t\t\tinvertebrates\n\t\t\t\t\tmammals\n\t\t\t\t\tplants\n\t\t\t\t\trodents\n\t\t\t\t\tvertebrates\n\n" ) do |tax_name|
    options[:tax_group] = tax_name
  end

  options[:user_db] = nil
  opts.on( '-d', '--blast_db DB_NAME', 'User blast plus database' ) do |db|
    options[:user_db] = db
  end

  # options[:verbose] = nil
  # opts.on( '-v', '--verbose_mode', "verbose mode\n\n" ) do |verbose|
  #   options[:verbose] = verbose
  # end

  options[:evalue] = 1.0e-25
  opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-25' ) do |evalue|
    options[:evalue] = evalue.to_f
  end

  options[:ident] = 45.00
  opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
    options[:ident] = ident.to_f
  end

  options[:distance] = 15
  opts.on( '-a', '--aas_distance DISTANCE', "distance threshold in aminoacids used for some calculations, the less distance the more strict. Default=15\n\n" ) do |distance|
    options[:distance] = distance.to_i
  end

  options[:workers] = 2
  opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
    if File.exist?(workers)
      # use workers file: one machine name per line
      options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
      # The first entry is dropped.
      # NOTE(review): presumably it is the node that runs the server -- confirm.
      options[:workers].shift
    elsif (workers.to_i > 0)
      options[:workers] = workers.to_i
    else
      # Neither an existing file nor a positive number: fall back to the default.
      options[:workers] = 2
    end
  end

  options[:chunk_size] = 200
  opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
    options[:chunk_size] = s.to_i
  end

  options[:server_ip] = '0.0.0.0'
  opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
    # get list of available IPv4 addresses
    ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}

    # Use the first local address that starts with the given (partial) ip,
    # or listen on every interface when none matches.
    options[:server_ip] = ip_list.find{|candidate| candidate.index(server_ip) == 0} || '0.0.0.0'
  end

  options[:port] = 0 #50000
  opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
    options[:port] = port.to_i
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end

end

# parse options and remove from ARGV
optparse.parse!
# @verbose = options[:verbose]

# if (!@verbose.nil?)
#   puts "You have chosen the verbose mode:\n\nInput File:\t#{options[:fasta]}\nTaxon Group:\t#{options[:tax_group]}\nOwn Database:\t#{options[:user_db]}\nCPU Number:\t#{options[:workers]}"
# end

#----------------------------------------------------------------------- testing errors in parameters entry
# Both the input fasta and the taxon group are mandatory.
if options[:fasta].nil? || options[:tax_group].nil?
  puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
  puts optparse.help
  exit
end
|
118
|
+
#----------------------------------------------------------------------- loading classes and gems
|
119
|
+
# Directory holding this script; used to locate the default database folder.
ROOT_PATH=File.dirname(__FILE__)

# $: << File.expand_path(File.join(ROOT_PATH, "classes"))

# load gem path, only to test locally
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')

require 'full_lengther_next'

# Use the BLASTDB directory when it is set and exists, otherwise fall back to
# a "blast_dbs" folder next to this script.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

ENV['BLASTDB']=formatted_db_path
puts "Using databases at: #{ENV['BLASTDB']}"

# The formatted Swiss-Prot database of the chosen taxon group must exist;
# a missing file also reveals a misspelled group name.
swissprot_db_file = "#{ENV['BLASTDB']}/sp_#{options[:tax_group]}/sp_#{options[:tax_group]}.fasta.psq"

unless File.exist?(swissprot_db_file)
  puts "DB File #{swissprot_db_file} doesn't exists, or"
  puts "incorrect taxon group name: #{options[:tax_group]} choose:"
  puts optparse.help
  exit
end
|
143
|
+
|
144
|
+
require 'logger' # Logger is used below; do not rely on another gem loading it
require 'scbi_blast' # is a gem
require 'scbi_mapreduce'
# puts $:
require 'fl_string_utils'
require "une_los_hit"
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
require "test_code"

########################################################## MAIN #################################################################

require 'my_worker_manager'

$LOG = Logger.new(STDOUT)
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"

# File with the worker class, handed to the manager below.
# NOTE(review): ROOT_PATH is the directory of this script, while the manifest
# lists my_worker.rb under lib/full_lengther_next/classes -- confirm this path.
custom_worker_file = File.join(ROOT_PATH,'classes','my_worker.rb')

$LOG.info 'Starting server'
# initialize work manager (open files, etc)
MyWorkerManager.init_work_manager(options, options[:chunk_size])

# Create server
server = ScbiMapreduce::Manager.new(options[:server_ip],options[:port], options[:workers], MyWorkerManager,custom_worker_file, STDOUT,File.join(ROOT_PATH,'init_env'))
server.chunk_size=options[:chunk_size]
# launch server
server.start_server

$LOG.info 'Closing server'

puts "\nGracias por utilizar Full-LengtherNEXT"
|