full_lengther_next 0.5.7 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -4
- data/bin/download_fln_dbs.rb +43 -32
- data/bin/full_lengther_next +3 -3
- data/bin/make_user_db.rb +25 -10
- data/lib/full_lengther_next.rb +1 -1
- metadata +5 -5
data/Rakefile
CHANGED
@@ -22,13 +22,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
23
|
self.extra_deps << ['scbi_blast','>=0.0.32']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
|
-
self.extra_deps << ['
|
25
|
+
self.extra_deps << ['scbi_zcat']
|
26
26
|
self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
|
27
27
|
self.extra_deps << ['bio', '>= 1.4.3']
|
28
|
-
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
29
28
|
self.extra_deps << ['scbi_plot','>=0.0.6']
|
30
|
-
|
31
|
-
|
29
|
+
|
32
30
|
end
|
33
31
|
|
34
32
|
require 'newgem/tasks'
|
data/bin/download_fln_dbs.rb
CHANGED
@@ -10,7 +10,7 @@ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"
|
|
10
10
|
require 'bio'
|
11
11
|
require 'net/ftp'
|
12
12
|
require 'open-uri'
|
13
|
-
require '
|
13
|
+
require 'scbi_zcat'
|
14
14
|
require 'optparse'
|
15
15
|
require 'cdhit'
|
16
16
|
require 'handle_db'
|
@@ -20,52 +20,64 @@ require 'handle_db'
|
|
20
20
|
#############################################################################################
|
21
21
|
def download_ncrna(formatted_db_path, no_download)
|
22
22
|
|
23
|
-
ncrna_zip = File.join(formatted_db_path, 'ncrna.
|
24
|
-
ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
|
23
|
+
ncrna_zip = File.join(formatted_db_path, 'ncrna.gz')
|
25
24
|
db_path = File.join(formatted_db_path, 'nc_rna_db')
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
db_files = File.join(db_path, 'ncrna')
|
26
|
+
fasta = File.join(db_path , 'filtered.fasta')
|
29
27
|
if !no_download
|
30
28
|
puts "Downloading ncRNA database"
|
31
29
|
open(ncrna_zip, 'wb') do |my_file|
|
32
|
-
my_file.print open('
|
30
|
+
my_file.print open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz').read
|
31
|
+
#my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
|
33
32
|
end
|
34
33
|
puts "\nncRNA database downloaded"
|
35
|
-
system("unzip", ncrna_zip, "-d", ncrna_fasta)
|
36
|
-
system("rm", ncrna_zip)
|
37
34
|
end
|
38
35
|
|
39
|
-
if File.exists?(
|
36
|
+
if File.exists?(ncrna_zip)
|
40
37
|
Dir.mkdir(db_path) if !File.exists?(db_path)
|
41
|
-
|
42
|
-
filtered_fasta = filtering_ncbi_seqs(
|
43
|
-
#system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
|
38
|
+
black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] #rrn = ribosomal rna
|
39
|
+
filtered_fasta = filtering_ncbi_seqs(ncrna_zip, 40, black_list)
|
44
40
|
puts "\nncRNA database decompressed and cleaned"
|
45
|
-
do_makeblastdb(filtered_fasta, db_files, 'nucl')
|
46
|
-
|
41
|
+
#do_makeblastdb(filtered_fasta, db_files, 'nucl')
|
42
|
+
output_file = File.open(fasta, 'w')
|
43
|
+
output_file.puts filtered_fasta
|
44
|
+
output_file.close
|
45
|
+
system("cd-hit -i #{fasta} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | makeblastdb -in - -out #{db_files} -title #{File.basename(db_files)} -dbtype 'nucl' -parse_seqids")
|
47
46
|
puts "\nncRNA database completed"
|
48
47
|
end
|
49
48
|
end
|
50
49
|
|
51
|
-
def filtering_ncbi_seqs(fasta_file, max_length)
|
52
|
-
|
50
|
+
def filtering_ncbi_seqs(fasta_file, max_length, black_list)
|
51
|
+
fasta = ScbiZcatFile.new(fasta_file)
|
53
52
|
filtered_fasta = ''
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
53
|
+
seq_name = nil
|
54
|
+
seq = ''
|
55
|
+
while !fasta.eof
|
56
|
+
line = fasta.readline.chomp
|
57
|
+
if line[0] == '>'
|
58
|
+
if !seq_name.nil?
|
59
|
+
filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
|
60
|
+
end
|
61
|
+
seq_name = line
|
62
|
+
seq = ''
|
63
|
+
else
|
64
|
+
seq << line
|
65
|
+
end
|
66
|
+
end
|
67
|
+
filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
|
68
|
+
|
69
|
+
return filtered_fasta
|
70
|
+
end
|
71
|
+
|
72
|
+
def compare_list(string, list)
|
73
|
+
res = FALSE
|
74
|
+
list.each do |word|
|
75
|
+
if string.include?(word)
|
76
|
+
res = TRUE
|
77
|
+
break
|
66
78
|
end
|
67
79
|
end
|
68
|
-
return
|
80
|
+
return res
|
69
81
|
end
|
70
82
|
|
71
83
|
def conecta_uniprot(my_array, formatted_db_path)
|
@@ -312,7 +324,7 @@ optparse = OptionParser.new do |opts|
|
|
312
324
|
end
|
313
325
|
|
314
326
|
options[:all] = FALSE
|
315
|
-
opts.on( '-a', '--all_sequences', 'Generate
|
327
|
+
opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
|
316
328
|
options[:all] = TRUE
|
317
329
|
end
|
318
330
|
|
@@ -354,7 +366,6 @@ else # otherwise use ROOTPATH + DB
|
|
354
366
|
Dir.mkdir(formatted_db_path)
|
355
367
|
end
|
356
368
|
|
357
|
-
puts formatted_db_path
|
358
369
|
|
359
370
|
ENV['BLASTDB'] = formatted_db_path
|
360
371
|
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
|
data/bin/full_lengther_next
CHANGED
@@ -63,9 +63,9 @@ optparse = OptionParser.new do |opts|
|
|
63
63
|
options[:ident] = ident.to_f
|
64
64
|
end
|
65
65
|
|
66
|
-
options[:high_clustering] =
|
67
|
-
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default
|
68
|
-
options[:high_clustering] =
|
66
|
+
options[:high_clustering] = FALSE
|
67
|
+
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
|
68
|
+
options[:high_clustering] = TRUE
|
69
69
|
end
|
70
70
|
|
71
71
|
options[:subject_coverage] = 0.25
|
data/bin/make_user_db.rb
CHANGED
@@ -63,6 +63,11 @@ optparse = OptionParser.new do |opts|
|
|
63
63
|
options[:local] = TRUE
|
64
64
|
end
|
65
65
|
|
66
|
+
options[:user_fasta] = nil
|
67
|
+
opts.on( '-f', '--user_fasta FILE', 'Use a cutom fasta file to build the user database') do |file|
|
68
|
+
options[:user_fasta] = file
|
69
|
+
end
|
70
|
+
|
66
71
|
options[:cdhit] = 0
|
67
72
|
opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
|
68
73
|
options[:cdhit] = cdhit.to_f
|
@@ -85,10 +90,14 @@ optparse.parse!
|
|
85
90
|
########################################################
|
86
91
|
## MAIN
|
87
92
|
########################################################
|
88
|
-
|
89
|
-
if options[:taxon].nil? || options[:uniprot_div].nil?
|
90
|
-
|
91
|
-
|
93
|
+
if options[:user_fasta].nil?
|
94
|
+
if options[:taxon].nil? || options[:uniprot_div].nil?
|
95
|
+
puts 'Taxon or uniprot division was not specified'
|
96
|
+
Process.exit(-1)
|
97
|
+
end
|
98
|
+
elsif !File.exists?(options[:user_fasta]) || options[:taxon].nil?
|
99
|
+
puts 'User fasta file not exists or taxon was not specified'
|
100
|
+
Process.exit(-1)
|
92
101
|
end
|
93
102
|
|
94
103
|
if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
|
@@ -102,15 +111,21 @@ if !options[:local]
|
|
102
111
|
else
|
103
112
|
user_db_folder = File.join(Dir.pwd, options[:taxon])
|
104
113
|
end
|
105
|
-
|
114
|
+
|
106
115
|
user_db_folder.gsub!(' ', '_')
|
107
|
-
output_file_path.gsub!(' ', '_')
|
108
116
|
Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
|
117
|
+
output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
|
118
|
+
output_file_path.gsub!(' ', '_')
|
109
119
|
|
110
|
-
|
111
|
-
|
112
|
-
isoform_hash =
|
113
|
-
seqs
|
120
|
+
seqs = ''
|
121
|
+
if options[:user_fasta].nil?
|
122
|
+
isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
|
123
|
+
seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
|
124
|
+
isoform_hash = nil
|
125
|
+
seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
|
126
|
+
else
|
127
|
+
seqs = File.open(options[:user_fasta]).read
|
128
|
+
end
|
114
129
|
|
115
130
|
if options[:cdhit] > 0
|
116
131
|
output_file = File.open(output_file_path, 'w')
|
data/lib/full_lengther_next.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: full_lengther_next
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.7
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-04-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: xml-simple
|
@@ -60,13 +60,13 @@ dependencies:
|
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: 0.0.29
|
62
62
|
- !ruby/object:Gem::Dependency
|
63
|
-
name:
|
63
|
+
name: scbi_zcat
|
64
64
|
requirement: !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
67
|
- - ! '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0
|
69
|
+
version: '0'
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -74,7 +74,7 @@ dependencies:
|
|
74
74
|
requirements:
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 0
|
77
|
+
version: '0'
|
78
78
|
- !ruby/object:Gem::Dependency
|
79
79
|
name: bio-cd-hit-report
|
80
80
|
requirement: !ruby/object:Gem::Requirement
|