full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -1,29 +1,44 @@
|
|
1
|
+
Rakefile
|
2
|
+
script
|
3
|
+
script/generate
|
4
|
+
script/destroy
|
5
|
+
script/console
|
6
|
+
test
|
7
|
+
test/test_full_lengther_next.rb
|
8
|
+
test/test_helper.rb
|
9
|
+
bin/plot_taxonomy.rb
|
10
|
+
bin/plot_fln.rb
|
1
11
|
bin/download_fln_dbs.rb
|
2
|
-
bin/make_user_db.rb
|
3
12
|
bin/full_lengther_next
|
13
|
+
bin/make_user_db.rb
|
14
|
+
bin/make_test_dataset.rb
|
15
|
+
PostInstall.txt
|
16
|
+
README.rdoc
|
4
17
|
History.txt
|
5
|
-
|
18
|
+
Manifest.txt
|
19
|
+
lib/full_lengther_next
|
20
|
+
lib/full_lengther_next/classes
|
21
|
+
lib/full_lengther_next/classes/blast_functions.rb
|
22
|
+
lib/full_lengther_next/classes/my_worker_manager_fln.rb
|
23
|
+
lib/full_lengther_next/classes/types.rb
|
6
24
|
lib/full_lengther_next/classes/chimeric_seqs.rb
|
25
|
+
lib/full_lengther_next/classes/artifacts.rb
|
26
|
+
lib/full_lengther_next/classes/cdhit.rb
|
7
27
|
lib/full_lengther_next/classes/fl_analysis.rb
|
8
28
|
lib/full_lengther_next/classes/fl_string_utils.rb
|
9
|
-
lib/full_lengther_next/classes/fln_stats.rb
|
10
|
-
lib/full_lengther_next/classes/lcs.rb
|
11
29
|
lib/full_lengther_next/classes/my_worker.rb
|
12
|
-
lib/full_lengther_next/classes/my_worker_manager.rb
|
13
|
-
lib/full_lengther_next/classes/nc_rna.rb
|
14
|
-
lib/full_lengther_next/classes/orf.rb
|
15
30
|
lib/full_lengther_next/classes/sequence.rb
|
31
|
+
lib/full_lengther_next/classes/my_worker_EST.rb
|
16
32
|
lib/full_lengther_next/classes/test_code.rb
|
33
|
+
lib/full_lengther_next/classes/orf.rb
|
17
34
|
lib/full_lengther_next/classes/une_los_hit.rb
|
35
|
+
lib/full_lengther_next/classes/warnings.rb
|
36
|
+
lib/full_lengther_next/classes/fln_stats.rb
|
37
|
+
lib/full_lengther_next/classes/my_worker_manager_EST.rb
|
38
|
+
lib/full_lengther_next/classes/nc_rna.rb
|
39
|
+
lib/full_lengther_next/classes/reptrans.rb
|
40
|
+
lib/full_lengther_next/classes/common_functions.rb
|
41
|
+
lib/full_lengther_next/classes/exonerate_result.rb
|
42
|
+
lib/full_lengther_next/classes/handle_db.rb
|
18
43
|
lib/full_lengther_next.rb
|
19
|
-
|
20
|
-
PostInstall.txt
|
21
|
-
Rakefile
|
22
|
-
README.rdoc
|
23
|
-
script
|
24
|
-
script/console
|
25
|
-
script/destroy
|
26
|
-
script/generate
|
27
|
-
test
|
28
|
-
test/test_full_lengther_next.rb
|
29
|
-
test/test_helper.rb
|
44
|
+
lib/expresscanvas.zip
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.plugin :newgem
|
|
11
11
|
# Generate all the Rake tasks
|
12
12
|
# Run 'rake -T' to see list of generated tasks (from gem root directory)
|
13
13
|
$hoe = Hoe.spec 'full_lengther_next' do
|
14
|
-
self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
|
14
|
+
self.developer 'Pedro Seoane & Noe Fernandez & Dario Guerrero ', 'seoanezonjic@hotmail.com & noeisneo@gmail.com & dariogf@gmail.com'
|
15
15
|
self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
|
16
16
|
self.rubyforge_name = self.name # TODO this is default value
|
17
17
|
# self.extra_deps = [['activesupport','>= 2.0.2']]
|
@@ -20,9 +20,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
20
20
|
# self.extra_deps << ['gnuplot','>=0']
|
21
21
|
# self.extra_deps << ['term-ansicolor','>=1.0.5']
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
|
-
self.extra_deps << ['scbi_blast','>=0.0.
|
23
|
+
self.extra_deps << ['scbi_blast','>=0.0.32']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
25
|
self.extra_deps << ['scbi_fasta','>=0.1.7']
|
26
|
+
self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
|
27
|
+
self.extra_deps << ['bio', '>= 1.4.3']
|
26
28
|
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
27
29
|
self.extra_deps << ['scbi_plot','>=0.0.6']
|
28
30
|
# self.extra_deps << ['scbi_math','>=0.0.1']
|
data/bin/download_fln_dbs.rb
CHANGED
@@ -1,66 +1,94 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# 15-2-2011 Noe Fernandez-Pozo
|
4
4
|
# Script to download Full-LengtherNext databases.
|
5
5
|
# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
|
6
6
|
|
7
|
+
ROOT_PATH=File.dirname(__FILE__)
|
8
|
+
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
|
9
|
+
|
10
|
+
require 'bio'
|
7
11
|
require 'net/ftp'
|
8
12
|
require 'open-uri'
|
13
|
+
require 'scbi_fasta'
|
14
|
+
require 'optparse'
|
15
|
+
require 'cdhit'
|
16
|
+
require 'handle_db'
|
17
|
+
|
18
|
+
##############################################################################################
|
19
|
+
## METHODS
|
20
|
+
#############################################################################################
|
21
|
+
# Downloads the ncRNA archive (unless +no_download+ is set) and builds a
# nucleotide BLAST database from it under <formatted_db_path>/nc_rna_db.
#
# formatted_db_path - directory where the archive and the databases are stored.
# no_download       - when true, skip the download and only process an
#                     ncrna.fasta that is already present.
#
# Returns nil. Calls filtering_ncbi_seqs and do_makeblastdb, which are defined
# elsewhere in the program.
def download_ncrna(formatted_db_path, no_download)
  ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
  ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
  db_path = File.join(formatted_db_path, 'nc_rna_db')

  unless no_download
    puts "Downloading ncRNA database"
    url = 'http://www.ncrna.org/frnadb/files/ncrna.zip'
    # Kernel#open stopped accepting URLs in Ruby 3.0; use URI.open when
    # open-uri provides it and fall back to the legacy call on old rubies.
    remote = URI.respond_to?(:open) ? URI.open(url) : open(url)
    begin
      File.open(ncrna_zip, 'wb') { |my_file| my_file.print remote.read }
    ensure
      remote.close if remote.respond_to?(:close)
    end
    puts "\nncRNA database downloaded"
    # NOTE(review): unzip's -d option names the extraction *directory*, so this
    # creates a directory called ncrna.fasta -- confirm that is intended.
    system("unzip", ncrna_zip, "-d", ncrna_fasta)
    system("rm", ncrna_zip)
  end

  if File.exist?(ncrna_fasta)
    Dir.mkdir(db_path) unless File.exist?(db_path)
    db_files = File.join(db_path, 'ncrna')
    # Keep only sequences of at least 40 residues, with shortened headers.
    filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
    puts "\nncRNA database decompressed and cleaned"
    do_makeblastdb(filtered_fasta, db_files, 'nucl')
    # Argument-list form: no shell is involved, so odd characters in the path are safe.
    system("rm", ncrna_fasta)
    puts "\nncRNA database completed"
  end
end
|
50
|
+
|
51
|
+
# Reads a FASTA file and returns, as one FASTA-formatted String, the sequences
# that are long enough, with overly long headers shortened.
#
# fasta_file - path handed to FastaQualFile.new.
# max_length - despite its name this is the MINIMUM length: sequences shorter
#              than this value are dropped.
#
# Headers of the form "db|id1,id2,...|description" are cut down to
# "db|id1|description". Headers without a second '|' field, with an empty
# second field, or containing '||' are kept unchanged.
def filtering_ncbi_seqs(fasta_file, max_length)
  fasta = FastaQualFile.new(fasta_file)
  filtered_fasta = ''.dup
  fasta.each do |name, seq, comments|
    next if seq.length < max_length

    header = "#{name} #{comments}"
    fields = header.split('|')
    # fields.length < 2 guards headers with no '|' at all, which previously
    # crashed on nil.split.
    if fields.length < 2 || fields[1].empty? || header.include?('||')
      new_name = header
    else # Cut huge description: keep only the first identifier of the list
      new_name = "#{fields[0]}|#{fields[1].split(',').first}|#{fields[2]}"
    end
    filtered_fasta << ">#{new_name}\n#{seq}\n"
  end
  filtered_fasta
end
|
36
70
|
|
37
71
|
# Connects to the UniProt mirror by anonymous FTP, downloads every requested
# taxonomic division and then the splice-variant (isoform) FASTA file.
#
# my_array          - list of division names, each passed to download_uniprot.
# formatted_db_path - destination directory; created when missing.
#
# The connection is stored in the global $ftp because download_uniprot reads
# it from there. It is always closed, even when a download raises.
def conecta_uniprot(my_array, formatted_db_path)
  Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
  varsplic_out = File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta.gz')

  $ftp = Net::FTP.new
  begin
    $ftp.connect('ftp.ebi.ac.uk')
    $ftp.login

    puts "connected to UniProt"
    my_array.each do |db_group|
      puts "Downloading #{db_group}"
      download_uniprot(db_group, formatted_db_path)
    end

    # Splice-variant file (the original author's note asks why it is needed).
    $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
    $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
  ensure
    $ftp.close
  end

  puts "isoform files downloaded"
end
|
65
93
|
|
66
94
|
def download_uniprot(uniprot_group, formatted_db_path)
|
@@ -70,156 +98,280 @@ def download_uniprot(uniprot_group, formatted_db_path)
|
|
70
98
|
$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
|
71
99
|
$ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
|
72
100
|
$ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
|
73
|
-
|
101
|
+
|
74
102
|
puts "#{uniprot_group} files downloaded"
|
75
103
|
|
76
104
|
end
|
77
105
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
organism_name = ''
|
97
|
-
seq = ''
|
98
|
-
organelle = ''
|
99
|
-
|
100
|
-
file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
|
101
|
-
db_name = $1
|
102
|
-
output_name = $2
|
103
|
-
db_name.sub!('sprot','sp')
|
104
|
-
db_name.sub!('trembl','tr')
|
105
|
-
|
106
|
-
if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
|
107
|
-
Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
|
108
|
-
end
|
109
|
-
|
110
|
-
output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
|
111
|
-
|
112
|
-
File.open(file_name).each_line do |line|
|
113
|
-
if (newseq == false)
|
114
|
-
if (line =~ /^AC\s+(\w+);/)
|
115
|
-
id=$1
|
116
|
-
newseq = true
|
117
|
-
description = ''
|
118
|
-
organism_name = ''
|
119
|
-
seq = ''
|
120
|
-
print_seq = true
|
121
|
-
organelle = ''
|
122
|
-
end
|
123
|
-
else
|
124
|
-
if (line =~ /^DE\s+(.+)\;*/)
|
125
|
-
if (description == '')
|
126
|
-
description = $1
|
127
|
-
description.sub!(/RecName: Full=/,'sp=')
|
128
|
-
description.sub!(/SubName: Full=/,'tr=')
|
129
|
-
end
|
130
|
-
if (line =~ /Flags: Fragment/)
|
131
|
-
# puts "#{id} #{line}"
|
132
|
-
print_seq=false
|
133
|
-
end
|
134
|
-
elsif (line =~ /^OS\s+(.+)/)
|
135
|
-
organism_name = $1
|
136
|
-
elsif (line =~ /^OG\s+(.+)/)
|
137
|
-
organelle = $1
|
138
|
-
elsif (line =~ /^FT\s+NON_TER\s+/)
|
139
|
-
print_seq=false
|
140
|
-
# puts "#{id} NON_TER"
|
141
|
-
elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
|
142
|
-
print_seq=false
|
143
|
-
# puts "#{id} NON_CONS"
|
144
|
-
elsif (line =~ /^\s+([\w\s]+)/)
|
145
|
-
seq += $1
|
146
|
-
elsif (line =~ /^\/\//)
|
147
|
-
seq.gsub!(/\s*/,'')
|
148
|
-
if (seq !~ /^M/i)
|
149
|
-
print_seq=false
|
150
|
-
end
|
151
|
-
newseq = false
|
152
|
-
|
153
|
-
if (print_seq)
|
154
|
-
output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
|
155
|
-
if (!isoform_hash[id].nil?)
|
156
|
-
output_file.puts isoform_hash[id]
|
157
|
-
end
|
158
|
-
end
|
106
|
+
|
107
|
+
|
108
|
+
# Filters one downloaded UniProt division and builds its BLAST database.
#
# formatted_db_path - directory holding the uniprot_<dbtype>_<db_group>.dat file.
# dbtype            - source name used in the .dat file name ('sprot' or 'trembl' at the call sites).
# db_group          - taxonomic division name.
# isoform_hash      - passed through to filter_incomplete_seqs (may be nil).
# prefix            - short database prefix ('sp' or 'tr' at the call sites).
# options           - option hash; reads :only_index and :cdhit here.
#
# Does nothing beyond printing the banner when the .dat file is missing.
# With options[:only_index] only the filtering step (which writes the index)
# runs. With options[:cdhit] > 0 the sequences are written to a FASTA file and
# piped through cd-hit into makeblastdb; otherwise do_makeblastdb is used.
def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
  file_name = "#{prefix}_#{db_group}"
  puts 'Building ' + file_name
  # Build both paths from their parts instead of stripping '.fasta' with gsub,
  # which also mangled any directory whose name contains '.fasta'.
  blastdb_input = File.join(formatted_db_path, file_name, file_name)
  fasta = "#{blastdb_input}.fasta"
  current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
  return unless File.exist?(current_db_source)

  seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
  return if options[:only_index]

  if options[:cdhit] > 0
    File.open(fasta, 'w') { |output_file| output_file.puts seqs }
    # NOTE(review): paths are interpolated into a shell pipeline unquoted;
    # a path containing spaces will break this command.
    system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids")
  else
    do_makeblastdb(seqs, blastdb_input, 'prot')
  end
end
|
164
128
|
|
165
|
-
def
|
129
|
+
# Tells whether a UniProt record describes a full-length protein.
#
# uniprot_record - object responding to description, seq and ft (ft is
#                  queried as a Hash keyed by feature name).
#
# Returns false when any incompleteness marker is found, true otherwise.
def complete?(uniprot_record)
  incomplete =
    uniprot_record.description.include?('Flags: Fragment') || # explicitly flagged as a fragment
    uniprot_record.seq[0] != 'M' ||                           # does not start with methionine
    uniprot_record.seq.include?('XX') ||                      # run of unknown residues
    uniprot_record.ft.key?('NON_TER') ||  # an extremity of the sequence is not the real N- or C-terminus
    uniprot_record.ft.key?('NON_CONS')    # non-consecutive residues: unreported residues are missing in between
  !incomplete
end
|
140
|
+
|
141
|
+
# Appends one full-length UniProt record to the FASTA buffer and writes its
# annotation line to the index.
#
# uniprot_record - object responding to accession, description, os, og, seq,
#                  oc and dr (dr is queried as a Hash keyed by database name
#                  whose values are arrays of [id, description, ...]).
# seqs           - String buffer receiving the FASTA text (mutated).
# index          - IO-like object receiving one tab-separated line.
# isoform_hash   - optional Hash of accession => isoform FASTA text; when it
#                  has an entry for this accession the text is appended too.
#
# Index columns, in order: accession, description, taxonomy, organism,
# sequence, organelle, GO ids, GO descriptions, KEGG ids, InterPro ids,
# InterPro descriptions, EC number, Pfam ids, Pfam descriptions,
# UniPathway ids. Missing values are written as '-'.
def fln_record(uniprot_record, seqs, index, isoform_hash)
  # Primary data
  accession_number = uniprot_record.accession
  description_data = uniprot_record.description.split(';')
  description = (description_data.first || '')
                .sub(/RecName: Full=/, 'sp=')
                .sub(/SubName: Full=/, 'tr=')
                .sub(/{\S*}/, '') # drop the first {...} tag

  organism = uniprot_record.os.first.values.reverse.join(' ')
  organelle = uniprot_record.og.join(' ')
  sequence = uniprot_record.seq.gsub('U', 'X') # selenocysteine is written as unknown residue

  # Secondary data
  cross_refs = uniprot_record.dr
  ec_field = description_data[1]
  ec_number = if ec_field && ec_field.include?('EC=')
                ec_field.split(' ').first.gsub('=', ':')
              else
                '-'
              end

  index_record = [
    accession_number,
    description,
    uniprot_record.oc.join(';'),
    organism,
    sequence,
    organelle.empty? ? '-' : organelle,
    fln_dr_column(cross_refs['GO'], 0),         # GO ID
    fln_dr_column(cross_refs['GO'], 1),         # GO description
    fln_dr_column(cross_refs['KEGG'], 0),
    fln_dr_column(cross_refs['InterPro'], 0),   # InterPro ID
    fln_dr_column(cross_refs['InterPro'], 1),   # InterPro description
    ec_number,
    fln_dr_column(cross_refs['Pfam'], 0),       # Pfam ID
    fln_dr_column(cross_refs['Pfam'], 1),       # Pfam description
    fln_dr_column(cross_refs['UniPathway'], 0)  # UniPathway ID
  ]

  seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
  index.puts index_record.join("\t")
  isoform = isoform_hash && isoform_hash[accession_number]
  seqs << isoform + "\n" if isoform
end

# Joins field +position+ of every cross-reference entry with ';', or returns
# '-' when the record has no entries for that database. Uses map (not map!)
# so the record's own data is left untouched; the in-place version made the
# Pfam description column read characters out of the already-extracted IDs.
def fln_dr_column(entries, position)
  return '-' if entries.nil?

  entries.map { |entry| entry[position] }.join(';')
end
|
216
|
+
|
217
|
+
# Appends one UniProt record to the FASTA buffer using a UniProt-style header:
#   >db|ACCESSION|ENTRY_ID description OS=organism GN=gene PE=n SV=n
#
# uniprot_record - object responding to accession, entry_id, os, seq,
#                  description, gn, get('PE') and dt (dt is queried with the
#                  'sequence' key).
# seqs           - String buffer receiving the FASTA text (mutated).
#
# db is 'sp' for "RecName" descriptions, 'tr' for "SubName" ones and empty
# otherwise. Fields that cannot be extracted are left empty in the header.
def ncbi_record(uniprot_record, seqs)
  accession_number = uniprot_record.accession
  id = uniprot_record.entry_id
  organism = uniprot_record.os.first.values.reverse.join(' ')
  sequence = uniprot_record.seq
  description = uniprot_record.description.split(';').first || ''

  gn_field = uniprot_record.gn.first
  gene_name = gn_field && gn_field[:name]

  # String#[] with a capture index returns nil on no match, so a missing
  # field can never pick up a stale $1 from an earlier regexp match.
  prediction_status = uniprot_record.get('PE').to_s[/PE\s+(\d+):/, 1]
  # No trailing wildcard: "(\d+)." swallowed the last digit whenever the
  # version number was not followed by another character.
  sequence_version = uniprot_record.dt['sequence'].to_s[/sequence version (\d+)/, 1]

  db = nil
  if description.include?('RecName: Full=')
    db = 'sp'
    description = description.sub(/RecName: Full=/, '')
  elsif description.include?('SubName: Full=')
    db = 'tr'
    description = description.sub(/SubName: Full=/, '')
  end

  seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
end
|
244
|
+
|
245
|
+
# Parses a UniProt flat file and returns the selected records as FASTA text.
#
# file_name         - path of the .dat flat file to read.
# isoform_hash      - passed to fln_record (may be nil).
# formatted_db_path - base directory; <formatted_db_path>/<db_name> is created
#                     when missing.
# db_name           - database name, used for the folder and the index file.
# options           - option hash; only :all is read here.
#
# With options[:all] every record is emitted through ncbi_record and no index
# is written. Otherwise only records accepted by complete? are emitted through
# fln_record, which also writes <db_name>.index inside the database folder.
# The index and the flat file are closed even when parsing raises.
def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
  puts "filtering sequences from #{file_name}"

  db_folder = File.join(formatted_db_path, db_name)
  Dir.mkdir(db_folder) unless File.exist?(db_folder)

  keep_all = options[:all]
  index = keep_all ? nil : File.open(File.join(db_folder, db_name) + '.index', 'w')
  seqs = ''.dup
  begin
    # Block form so the flat file is closed when the block ends.
    Bio::FlatFile.auto(file_name) do |flat_file|
      flat_file.each_entry do |uniprot_record|
        if keep_all
          ncbi_record(uniprot_record, seqs)
        elsif complete?(uniprot_record) # Get attributes of full length records only
          fln_record(uniprot_record, seqs, index, isoform_hash)
        end
      end
    end
  ensure
    index.close if index
  end
  seqs
end
|
187
270
|
|
188
|
-
################################################### MAIN
|
189
271
|
|
190
|
-
|
272
|
+
|
273
|
+
##########################################################################################
|
274
|
+
## OPTIONS
|
275
|
+
##########################################################################################
|
276
|
+
|
277
|
+
# Command-line options. Every key gets its default before the matching switch
# is declared, so `options` is fully populated even with an empty ARGV.
options = {}

# Taxonomic divisions that can be requested with -u.
divs = %w{human fungi invertebrates mammals plants rodents vertebrates}

optparse = OptionParser.new do |opts|
  options[:uniprot_div] = divs
  opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
    temp_divs = uniprot_div.split(',')
    check_valid_ids = temp_divs - divs
    if !check_valid_ids.empty?
      puts 'This uniprot division not exists', check_valid_ids
      # `process.exit` was a NameError; terminate with a failure status instead.
      exit 1
    else
      options[:uniprot_div] = temp_divs
    end
  end

  options[:no_download] = false
  opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
    options[:no_download] = true
  end

  options[:no_ncrna] = false
  opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
    options[:no_ncrna] = true
  end

  options[:only_index] = false
  opts.on( '-i', '--only_index', 'Build annotation index only without blast DB') do
    options[:only_index] = true
  end

  options[:no_trembl] = false
  opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
    options[:no_trembl] = true
  end

  options[:all] = false
  opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
    options[:all] = true
  end

  options[:cdhit] = 0
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
    options[:cdhit] = cdhit.to_f
  end

  options[:no_uniprot] = false
  opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
    options[:no_uniprot] = true
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
|
343
|
+
|
344
|
+
|
345
|
+
##############################################################################################
|
346
|
+
## MAIN
|
347
|
+
##############################################################################################
|
348
|
+
|
191
349
|
|
192
350
|
# Pick the database directory: $BLASTDB when it points to an existing path,
# otherwise a blast_dbs folder next to this script.
blastdb_env = ENV['BLASTDB']
if blastdb_env && File.exist?(blastdb_env)
  formatted_db_path = blastdb_env
else # otherwise use ROOT_PATH + blast_dbs
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
  # The folder survives between runs; creating it unconditionally raised EEXIST.
  Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
end

puts formatted_db_path

ENV['BLASTDB'] = formatted_db_path
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"

download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]

if !options[:no_download]
  conecta_uniprot(options[:uniprot_div], formatted_db_path)
  # Expand the pattern inside the directory. The former string concatenation
  # produced "<dir>*.gz", which matches nothing unless the path ends in '/'.
  gz_files = Dir.glob(File.join(formatted_db_path, '*.gz'))
  system('gunzip', *gz_files) unless gz_files.empty?
end

if !options[:no_uniprot]
  # Splice-variant file (the original author's note asks why it is needed).
  isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
  options[:uniprot_div].each do |db_group|
    filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
    filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl]
  end
end
puts "download_fln_dbs.rb has finished"
|