full_lengther_next 0.5.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -4
- data/bin/download_fln_dbs.rb +43 -32
- data/bin/full_lengther_next +3 -3
- data/bin/make_user_db.rb +25 -10
- data/lib/full_lengther_next.rb +1 -1
- metadata +5 -5
data/Rakefile
CHANGED
@@ -22,13 +22,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
23
|
self.extra_deps << ['scbi_blast','>=0.0.32']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
|
-
self.extra_deps << ['
|
25
|
+
self.extra_deps << ['scbi_zcat']
|
26
26
|
self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
|
27
27
|
self.extra_deps << ['bio', '>= 1.4.3']
|
28
|
-
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
29
28
|
self.extra_deps << ['scbi_plot','>=0.0.6']
|
30
|
-
|
31
|
-
|
29
|
+
|
32
30
|
end
|
33
31
|
|
34
32
|
require 'newgem/tasks'
|
data/bin/download_fln_dbs.rb
CHANGED
@@ -10,7 +10,7 @@ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"
|
|
10
10
|
require 'bio'
|
11
11
|
require 'net/ftp'
|
12
12
|
require 'open-uri'
|
13
|
-
require '
|
13
|
+
require 'scbi_zcat'
|
14
14
|
require 'optparse'
|
15
15
|
require 'cdhit'
|
16
16
|
require 'handle_db'
|
@@ -20,52 +20,64 @@ require 'handle_db'
|
|
20
20
|
#############################################################################################
|
21
21
|
def download_ncrna(formatted_db_path, no_download)
|
22
22
|
|
23
|
-
ncrna_zip = File.join(formatted_db_path, 'ncrna.
|
24
|
-
ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
|
23
|
+
ncrna_zip = File.join(formatted_db_path, 'ncrna.gz')
|
25
24
|
db_path = File.join(formatted_db_path, 'nc_rna_db')
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
db_files = File.join(db_path, 'ncrna')
|
26
|
+
fasta = File.join(db_path , 'filtered.fasta')
|
29
27
|
if !no_download
|
30
28
|
puts "Downloading ncRNA database"
|
31
29
|
open(ncrna_zip, 'wb') do |my_file|
|
32
|
-
my_file.print open('
|
30
|
+
my_file.print open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz').read
|
31
|
+
#my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
|
33
32
|
end
|
34
33
|
puts "\nncRNA database downloaded"
|
35
|
-
system("unzip", ncrna_zip, "-d", ncrna_fasta)
|
36
|
-
system("rm", ncrna_zip)
|
37
34
|
end
|
38
35
|
|
39
|
-
if File.exists?(
|
36
|
+
if File.exists?(ncrna_zip)
|
40
37
|
Dir.mkdir(db_path) if !File.exists?(db_path)
|
41
|
-
|
42
|
-
filtered_fasta = filtering_ncbi_seqs(
|
43
|
-
#system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
|
38
|
+
black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] #rrn = ribosonal rna
|
39
|
+
filtered_fasta = filtering_ncbi_seqs(ncrna_zip, 40, black_list)
|
44
40
|
puts "\nncRNA database decompressed and cleaned"
|
45
|
-
do_makeblastdb(filtered_fasta, db_files, 'nucl')
|
46
|
-
|
41
|
+
#do_makeblastdb(filtered_fasta, db_files, 'nucl')
|
42
|
+
output_file = File.open(fasta, 'w')
|
43
|
+
output_file.puts filtered_fasta
|
44
|
+
output_file.close
|
45
|
+
system("cd-hit -i #{fasta} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | makeblastdb -in - -out #{db_files} -title #{File.basename(db_files)} -dbtype 'nucl' -parse_seqids")
|
47
46
|
puts "\nncRNA database completed"
|
48
47
|
end
|
49
48
|
end
|
50
49
|
|
51
|
-
def filtering_ncbi_seqs(fasta_file, max_length)
|
52
|
-
|
50
|
+
def filtering_ncbi_seqs(fasta_file, max_length, black_list)
|
51
|
+
fasta = ScbiZcatFile.new(fasta_file)
|
53
52
|
filtered_fasta = ''
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
53
|
+
seq_name = nil
|
54
|
+
seq = ''
|
55
|
+
while !fasta.eof
|
56
|
+
line = fasta.readline.chomp
|
57
|
+
if line[0] == '>'
|
58
|
+
if !seq_name.nil?
|
59
|
+
filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
|
60
|
+
end
|
61
|
+
seq_name = line
|
62
|
+
seq = ''
|
63
|
+
else
|
64
|
+
seq << line
|
65
|
+
end
|
66
|
+
end
|
67
|
+
filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
|
68
|
+
|
69
|
+
return filtered_fasta
|
70
|
+
end
|
71
|
+
|
72
|
+
def compare_list(string, list)
|
73
|
+
res = FALSE
|
74
|
+
list.each do |word|
|
75
|
+
if string.include?(word)
|
76
|
+
res = TRUE
|
77
|
+
break
|
66
78
|
end
|
67
79
|
end
|
68
|
-
return
|
80
|
+
return res
|
69
81
|
end
|
70
82
|
|
71
83
|
def conecta_uniprot(my_array, formatted_db_path)
|
@@ -312,7 +324,7 @@ optparse = OptionParser.new do |opts|
|
|
312
324
|
end
|
313
325
|
|
314
326
|
options[:all] = FALSE
|
315
|
-
opts.on( '-a', '--all_sequences', 'Generate
|
327
|
+
opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
|
316
328
|
options[:all] = TRUE
|
317
329
|
end
|
318
330
|
|
@@ -354,7 +366,6 @@ else # otherwise use ROOTPATH + DB
|
|
354
366
|
Dir.mkdir(formatted_db_path)
|
355
367
|
end
|
356
368
|
|
357
|
-
puts formatted_db_path
|
358
369
|
|
359
370
|
ENV['BLASTDB'] = formatted_db_path
|
360
371
|
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
|
data/bin/full_lengther_next
CHANGED
@@ -63,9 +63,9 @@ optparse = OptionParser.new do |opts|
|
|
63
63
|
options[:ident] = ident.to_f
|
64
64
|
end
|
65
65
|
|
66
|
-
options[:high_clustering] =
|
67
|
-
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default
|
68
|
-
options[:high_clustering] =
|
66
|
+
options[:high_clustering] = FALSE
|
67
|
+
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
|
68
|
+
options[:high_clustering] = TRUE
|
69
69
|
end
|
70
70
|
|
71
71
|
options[:subject_coverage] = 0.25
|
data/bin/make_user_db.rb
CHANGED
@@ -63,6 +63,11 @@ optparse = OptionParser.new do |opts|
|
|
63
63
|
options[:local] = TRUE
|
64
64
|
end
|
65
65
|
|
66
|
+
options[:user_fasta] = nil
|
67
|
+
opts.on( '-f', '--user_fasta FILE', 'Use a cutom fasta file to build the user database') do |file|
|
68
|
+
options[:user_fasta] = file
|
69
|
+
end
|
70
|
+
|
66
71
|
options[:cdhit] = 0
|
67
72
|
opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
|
68
73
|
options[:cdhit] = cdhit.to_f
|
@@ -85,10 +90,14 @@ optparse.parse!
|
|
85
90
|
########################################################
|
86
91
|
## MAIN
|
87
92
|
########################################################
|
88
|
-
|
89
|
-
if options[:taxon].nil? || options[:uniprot_div].nil?
|
90
|
-
|
91
|
-
|
93
|
+
if options[:user_fasta].nil?
|
94
|
+
if options[:taxon].nil? || options[:uniprot_div].nil?
|
95
|
+
puts 'Taxon or uniprot division was not specified'
|
96
|
+
Process.exit(-1)
|
97
|
+
end
|
98
|
+
elsif !File.exists?(options[:user_fasta]) || options[:taxon].nil?
|
99
|
+
puts 'User fasta file not exists or taxon was not specified'
|
100
|
+
Process.exit(-1)
|
92
101
|
end
|
93
102
|
|
94
103
|
if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
|
@@ -102,15 +111,21 @@ if !options[:local]
|
|
102
111
|
else
|
103
112
|
user_db_folder = File.join(Dir.pwd, options[:taxon])
|
104
113
|
end
|
105
|
-
|
114
|
+
|
106
115
|
user_db_folder.gsub!(' ', '_')
|
107
|
-
output_file_path.gsub!(' ', '_')
|
108
116
|
Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
|
117
|
+
output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
|
118
|
+
output_file_path.gsub!(' ', '_')
|
109
119
|
|
110
|
-
|
111
|
-
|
112
|
-
isoform_hash =
|
113
|
-
seqs
|
120
|
+
seqs = ''
|
121
|
+
if options[:user_fasta].nil?
|
122
|
+
isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
|
123
|
+
seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
|
124
|
+
isoform_hash = nil
|
125
|
+
seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
|
126
|
+
else
|
127
|
+
seqs = File.open(options[:user_fasta]).read
|
128
|
+
end
|
114
129
|
|
115
130
|
if options[:cdhit] > 0
|
116
131
|
output_file = File.open(output_file_path, 'w')
|
data/lib/full_lengther_next.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: full_lengther_next
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-04-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: xml-simple
|
@@ -60,13 +60,13 @@ dependencies:
|
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: 0.0.29
|
62
62
|
- !ruby/object:Gem::Dependency
|
63
|
-
name:
|
63
|
+
name: scbi_zcat
|
64
64
|
requirement: !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
67
|
- - ! '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0
|
69
|
+
version: '0'
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -74,7 +74,7 @@ dependencies:
|
|
74
74
|
requirements:
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 0
|
77
|
+
version: '0'
|
78
78
|
- !ruby/object:Gem::Dependency
|
79
79
|
name: bio-cd-hit-report
|
80
80
|
requirement: !ruby/object:Gem::Requirement
|