full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -1,29 +1,44 @@
|
|
1
|
+
Rakefile
|
2
|
+
script
|
3
|
+
script/generate
|
4
|
+
script/destroy
|
5
|
+
script/console
|
6
|
+
test
|
7
|
+
test/test_full_lengther_next.rb
|
8
|
+
test/test_helper.rb
|
9
|
+
bin/plot_taxonomy.rb
|
10
|
+
bin/plot_fln.rb
|
1
11
|
bin/download_fln_dbs.rb
|
2
|
-
bin/make_user_db.rb
|
3
12
|
bin/full_lengther_next
|
13
|
+
bin/make_user_db.rb
|
14
|
+
bin/make_test_dataset.rb
|
15
|
+
PostInstall.txt
|
16
|
+
README.rdoc
|
4
17
|
History.txt
|
5
|
-
|
18
|
+
Manifest.txt
|
19
|
+
lib/full_lengther_next
|
20
|
+
lib/full_lengther_next/classes
|
21
|
+
lib/full_lengther_next/classes/blast_functions.rb
|
22
|
+
lib/full_lengther_next/classes/my_worker_manager_fln.rb
|
23
|
+
lib/full_lengther_next/classes/types.rb
|
6
24
|
lib/full_lengther_next/classes/chimeric_seqs.rb
|
25
|
+
lib/full_lengther_next/classes/artifacts.rb
|
26
|
+
lib/full_lengther_next/classes/cdhit.rb
|
7
27
|
lib/full_lengther_next/classes/fl_analysis.rb
|
8
28
|
lib/full_lengther_next/classes/fl_string_utils.rb
|
9
|
-
lib/full_lengther_next/classes/fln_stats.rb
|
10
|
-
lib/full_lengther_next/classes/lcs.rb
|
11
29
|
lib/full_lengther_next/classes/my_worker.rb
|
12
|
-
lib/full_lengther_next/classes/my_worker_manager.rb
|
13
|
-
lib/full_lengther_next/classes/nc_rna.rb
|
14
|
-
lib/full_lengther_next/classes/orf.rb
|
15
30
|
lib/full_lengther_next/classes/sequence.rb
|
31
|
+
lib/full_lengther_next/classes/my_worker_EST.rb
|
16
32
|
lib/full_lengther_next/classes/test_code.rb
|
33
|
+
lib/full_lengther_next/classes/orf.rb
|
17
34
|
lib/full_lengther_next/classes/une_los_hit.rb
|
35
|
+
lib/full_lengther_next/classes/warnings.rb
|
36
|
+
lib/full_lengther_next/classes/fln_stats.rb
|
37
|
+
lib/full_lengther_next/classes/my_worker_manager_EST.rb
|
38
|
+
lib/full_lengther_next/classes/nc_rna.rb
|
39
|
+
lib/full_lengther_next/classes/reptrans.rb
|
40
|
+
lib/full_lengther_next/classes/common_functions.rb
|
41
|
+
lib/full_lengther_next/classes/exonerate_result.rb
|
42
|
+
lib/full_lengther_next/classes/handle_db.rb
|
18
43
|
lib/full_lengther_next.rb
|
19
|
-
|
20
|
-
PostInstall.txt
|
21
|
-
Rakefile
|
22
|
-
README.rdoc
|
23
|
-
script
|
24
|
-
script/console
|
25
|
-
script/destroy
|
26
|
-
script/generate
|
27
|
-
test
|
28
|
-
test/test_full_lengther_next.rb
|
29
|
-
test/test_helper.rb
|
44
|
+
lib/expresscanvas.zip
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.plugin :newgem
|
|
11
11
|
# Generate all the Rake tasks
|
12
12
|
# Run 'rake -T' to see list of generated tasks (from gem root directory)
|
13
13
|
$hoe = Hoe.spec 'full_lengther_next' do
|
14
|
-
self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
|
14
|
+
self.developer 'Pedro Seoane & Noe Fernandez & Dario Guerrero ', 'seoanezonjic@hotmail.com & noeisneo@gmail.com & dariogf@gmail.com'
|
15
15
|
self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
|
16
16
|
self.rubyforge_name = self.name # TODO this is default value
|
17
17
|
# self.extra_deps = [['activesupport','>= 2.0.2']]
|
@@ -20,9 +20,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
|
|
20
20
|
# self.extra_deps << ['gnuplot','>=0']
|
21
21
|
# self.extra_deps << ['term-ansicolor','>=1.0.5']
|
22
22
|
self.extra_deps << ['xml-simple','>=1.0.12']
|
23
|
-
self.extra_deps << ['scbi_blast','>=0.0.
|
23
|
+
self.extra_deps << ['scbi_blast','>=0.0.32']
|
24
24
|
self.extra_deps << ['scbi_mapreduce','>=0.0.29']
|
25
25
|
self.extra_deps << ['scbi_fasta','>=0.1.7']
|
26
|
+
self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
|
27
|
+
self.extra_deps << ['bio', '>= 1.4.3']
|
26
28
|
# self.extra_deps << ['scbi_fastq','>=0.0.13']
|
27
29
|
self.extra_deps << ['scbi_plot','>=0.0.6']
|
28
30
|
# self.extra_deps << ['scbi_math','>=0.0.1']
|
data/bin/download_fln_dbs.rb
CHANGED
@@ -1,66 +1,94 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# 15-2-2011 Noe Fernandez-Pozo
|
4
4
|
# Script to download Full-LengtherNext databases.
|
5
5
|
# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
|
6
6
|
|
7
|
+
ROOT_PATH=File.dirname(__FILE__)
|
8
|
+
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
|
9
|
+
|
10
|
+
require 'bio'
|
7
11
|
require 'net/ftp'
|
8
12
|
require 'open-uri'
|
13
|
+
require 'scbi_fasta'
|
14
|
+
require 'optparse'
|
15
|
+
require 'cdhit'
|
16
|
+
require 'handle_db'
|
17
|
+
|
18
|
+
##############################################################################################
|
19
|
+
## METHODS
|
20
|
+
#############################################################################################
|
21
|
+
# Fetches the ncRNA FASTA archive (unless +no_download+ is set) and builds a
# nucleotide BLAST database from it under <formatted_db_path>/nc_rna_db.
#
# formatted_db_path - directory that holds the archive, the FASTA and the DB.
# no_download       - when truthy, skip the download/unzip step and only
#                     process a ncrna.fasta that is already on disk.
#
# Calls filtering_ncbi_seqs and do_makeblastdb, which are defined outside
# this method.
def download_ncrna(formatted_db_path, no_download)
  ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
  ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
  db_path = File.join(formatted_db_path, 'nc_rna_db')

  unless no_download
    puts "Downloading ncRNA database"
    File.open(ncrna_zip, 'wb') do |my_file|
      # URI#read is provided by open-uri; Kernel#open no longer accepts URLs
      # on current Ruby versions.
      my_file.print URI.parse('http://www.ncrna.org/frnadb/files/ncrna.zip').read
    end
    puts "\nncRNA database downloaded"
    # NOTE(review): "-d" makes unzip treat ncrna_fasta as the extraction
    # directory; confirm this matches the layout of the archive.
    system("unzip", ncrna_zip, "-d", ncrna_fasta)
    system("rm", ncrna_zip)
  end

  if File.exist?(ncrna_fasta)
    Dir.mkdir(db_path) unless File.exist?(db_path)
    db_files = File.join(db_path, 'ncrna')
    # 40 is the length threshold handed to filtering_ncbi_seqs
    filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
    #system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
    puts "\nncRNA database decompressed and cleaned"
    do_makeblastdb(filtered_fasta, db_files, 'nucl')
    # argv form: the path is never interpreted by a shell
    system("rm", ncrna_fasta)
    puts "\nncRNA database completed"
  end
end
|
50
|
+
|
51
|
+
# Reads +fasta_file+ with FastaQualFile and returns a FASTA-formatted String
# containing only the sequences whose length is >= +max_length+.
#
# fasta_file - path handed to FastaQualFile.new.
# max_length - despite its name this is used as the MINIMUM sequence length
#              that is kept (shorter sequences are dropped).
#
# Headers are rebuilt as "<name> <comments>". When the header looks like
# "a|id1,id2,...|rest" the id list is cut down to its first id. Headers with
# an empty second field, a "||", or no "|" at all are kept untouched (the
# original code crashed on headers without "|").
def filtering_ncbi_seqs(fasta_file, max_length)
  fasta = FastaQualFile.new(fasta_file)
  filtered_fasta = ''
  fasta.each do |name, seq, comments|
    next if seq.length < max_length

    name = "#{name} #{comments}"
    fields = name.split('|')
    if fields.length < 2 || fields[1] == '' || name.include?('||')
      new_name = name
    else # Cut huge description
      first_id = fields[1].split(',').first
      new_name = "#{fields[0]}|#{first_id}|#{fields[2]}"
    end
    filtered_fasta << ">#{new_name}\n#{seq}\n"
  end
  filtered_fasta
end
|
36
70
|
|
37
71
|
# Connects to the EBI FTP mirror, downloads every requested UniProt
# taxonomic division and finally the splice-variant (isoform) FASTA file.
#
# my_array          - list of division names; each one is passed to
#                     download_uniprot.
# formatted_db_path - destination directory (created when missing).
#
# The connection is stored in the global $ftp because download_uniprot
# reads it from there. It is always closed, even when a transfer raises.
def conecta_uniprot(my_array, formatted_db_path)
  Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
  varsplic_out = File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta.gz')

  $ftp = Net::FTP.new()
  begin
    $ftp.connect('ftp.ebi.ac.uk')
    $ftp.login

    puts "connected to UniProt"
    my_array.each do |db_group|
      puts "Downloading #{db_group}"
      download_uniprot(db_group, formatted_db_path)
    end

    # Splice-variant file. (The original author left an open "why?" here.)
    $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
    $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
  ensure
    $ftp.close unless $ftp.closed?
  end

  puts "isoform files downloaded"
end
|
65
93
|
|
66
94
|
def download_uniprot(uniprot_group, formatted_db_path)
|
@@ -70,156 +98,280 @@ def download_uniprot(uniprot_group, formatted_db_path)
|
|
70
98
|
$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
|
71
99
|
$ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
|
72
100
|
$ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
|
73
|
-
|
101
|
+
|
74
102
|
puts "#{uniprot_group} files downloaded"
|
75
103
|
|
76
104
|
end
|
77
105
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
organism_name = ''
|
97
|
-
seq = ''
|
98
|
-
organelle = ''
|
99
|
-
|
100
|
-
file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
|
101
|
-
db_name = $1
|
102
|
-
output_name = $2
|
103
|
-
db_name.sub!('sprot','sp')
|
104
|
-
db_name.sub!('trembl','tr')
|
105
|
-
|
106
|
-
if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
|
107
|
-
Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
|
108
|
-
end
|
109
|
-
|
110
|
-
output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
|
111
|
-
|
112
|
-
File.open(file_name).each_line do |line|
|
113
|
-
if (newseq == false)
|
114
|
-
if (line =~ /^AC\s+(\w+);/)
|
115
|
-
id=$1
|
116
|
-
newseq = true
|
117
|
-
description = ''
|
118
|
-
organism_name = ''
|
119
|
-
seq = ''
|
120
|
-
print_seq = true
|
121
|
-
organelle = ''
|
122
|
-
end
|
123
|
-
else
|
124
|
-
if (line =~ /^DE\s+(.+)\;*/)
|
125
|
-
if (description == '')
|
126
|
-
description = $1
|
127
|
-
description.sub!(/RecName: Full=/,'sp=')
|
128
|
-
description.sub!(/SubName: Full=/,'tr=')
|
129
|
-
end
|
130
|
-
if (line =~ /Flags: Fragment/)
|
131
|
-
# puts "#{id} #{line}"
|
132
|
-
print_seq=false
|
133
|
-
end
|
134
|
-
elsif (line =~ /^OS\s+(.+)/)
|
135
|
-
organism_name = $1
|
136
|
-
elsif (line =~ /^OG\s+(.+)/)
|
137
|
-
organelle = $1
|
138
|
-
elsif (line =~ /^FT\s+NON_TER\s+/)
|
139
|
-
print_seq=false
|
140
|
-
# puts "#{id} NON_TER"
|
141
|
-
elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
|
142
|
-
print_seq=false
|
143
|
-
# puts "#{id} NON_CONS"
|
144
|
-
elsif (line =~ /^\s+([\w\s]+)/)
|
145
|
-
seq += $1
|
146
|
-
elsif (line =~ /^\/\//)
|
147
|
-
seq.gsub!(/\s*/,'')
|
148
|
-
if (seq !~ /^M/i)
|
149
|
-
print_seq=false
|
150
|
-
end
|
151
|
-
newseq = false
|
152
|
-
|
153
|
-
if (print_seq)
|
154
|
-
output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
|
155
|
-
if (!isoform_hash[id].nil?)
|
156
|
-
output_file.puts isoform_hash[id]
|
157
|
-
end
|
158
|
-
end
|
106
|
+
|
107
|
+
|
108
|
+
# Filters one UniProt flat file and builds the matching protein BLAST DB.
#
# formatted_db_path - base directory holding the uniprot_*.dat files.
# dbtype            - 'sprot' or 'trembl' (part of the .dat file name).
# db_group          - taxonomic division name.
# isoform_hash      - isoform FASTA entries keyed by accession, or nil.
# prefix            - output prefix ('sp' / 'tr').
# options           - option hash; reads :only_index and :cdhit here and is
#                     forwarded to filter_incomplete_seqs.
#
# Does nothing (returns nil) when the .dat source file is missing. With
# :only_index set only the annotation index is produced. Otherwise the
# sequences go either through cd-hit piped into makeblastdb, or straight to
# do_makeblastdb (defined outside this method).
def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
  file_name = prefix + '_' + db_group
  puts 'Building ' + file_name
  fasta = File.join(formatted_db_path, file_name, "#{file_name}.fasta")
  # Strip only the trailing extension, not every ".fasta" found in the path
  blastdb_input = fasta.sub(/\.fasta\z/, '')
  current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
  return unless File.exist?(current_db_source)

  seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
  return if options[:only_index]

  if options[:cdhit] > 0
    # Block form guarantees the FASTA is flushed and closed before cd-hit reads it
    File.open(fasta, 'w') { |output_file| output_file.puts seqs }
    # cd-hit writes its clustered FASTA to stderr, which is redirected into makeblastdb's stdin
    system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids")
  else
    do_makeblastdb(seqs, blastdb_input, 'prot')
  end
end
|
164
128
|
|
165
|
-
def
|
129
|
+
# Tells whether a UniProt record describes a full-length protein.
#
# uniprot_record - object responding to #description, #seq and #ft
#                  (#ft is expected to return a Hash keyed by feature name).
#
# Returns false when any of these holds, true otherwise:
# * the description carries 'Flags: Fragment';
# * the sequence does not start with 'M';
# * the sequence contains 'XX';
# * a NON_TER feature is present (an extremity of the sequence is not the
#   real terminal residue of the molecule);
# * a NON_CONS feature is present (non-consecutive residues, i.e. missing
#   residues inside the sequence).
def complete?(uniprot_record)
  return false if uniprot_record.description.include?('Flags: Fragment')

  sequence = uniprot_record.seq
  return false unless sequence.start_with?('M')
  return false if sequence.include?('XX')

  features = uniprot_record.ft
  !(features.key?('NON_TER') || features.key?('NON_CONS'))
end
|
140
|
+
|
141
|
+
# Joins element +position+ of every cross-reference entry with ';'.
# Returns '-' when the record has no entries for that database.
def fln_dr_column(entries, position)
  return '-' if entries.nil?

  entries.map { |entry| entry[position] }.join(';')
end

# Appends one UniProt record to the FASTA buffer and writes its annotation
# line to the index.
#
# uniprot_record - object responding to #accession, #description, #os, #og,
#                  #seq, #oc and #dr.
# seqs           - String buffer; the FASTA entry is appended to it.
# index          - IO-like object; one tab-separated line is written via #puts.
# isoform_hash   - Hash accession => isoform FASTA text, or nil.
#
# Index columns, in order: accession, description, taxonomy, organism,
# sequence, organelle, GO ids, GO descriptions, KEGG ids, InterPro ids,
# InterPro descriptions, EC number, Pfam ids, Pfam descriptions,
# UniPathway ids. Missing values are written as '-'.
#
# The cross-reference arrays of the record are read without being modified
# (the original used map!, which corrupted the Pfam description column).
def fln_record(uniprot_record, seqs, index, isoform_hash)
  # Primary data
  accession_number = uniprot_record.accession
  description_data = uniprot_record.description.split(';')
  description = description_data.first.to_s
  description.sub!(/RecName: Full=/, 'sp=')
  description.sub!(/SubName: Full=/, 'tr=')
  description.sub!(/{\S*}/, '')

  organism = uniprot_record.os.first.values.reverse.join(' ')
  organelle = uniprot_record.og.join(' ')
  # 'U' residues are replaced by 'X' in the stored sequence
  sequence = uniprot_record.seq.gsub('U', 'X')
  taxonomy = uniprot_record.oc.join(';')

  # EC number lives in the second ';'-separated chunk of the description
  ec_field = description_data[1]
  if !ec_field.nil? && ec_field.include?('EC=')
    ec_number = ec_field.split(' ').first.gsub('=', ':')
  else
    ec_number = '-'
  end

  # Secondary data
  cross_refs = uniprot_record.dr
  index_record = [
    accession_number,
    description,
    taxonomy,
    organism,
    sequence,
    organelle.empty? ? '-' : organelle,
    fln_dr_column(cross_refs['GO'], 0),         # GO ID
    fln_dr_column(cross_refs['GO'], 1),         # GO description
    fln_dr_column(cross_refs['KEGG'], 0),       # KEGG ID
    fln_dr_column(cross_refs['InterPro'], 0),   # InterPro ID
    fln_dr_column(cross_refs['InterPro'], 1),   # InterPro description
    ec_number,
    fln_dr_column(cross_refs['Pfam'], 0),       # Pfam ID
    fln_dr_column(cross_refs['Pfam'], 1),       # Pfam description
    fln_dr_column(cross_refs['UniPathway'], 0)  # UniPathway ID
  ]

  seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
  index.puts index_record.join("\t")
  if !isoform_hash.nil? && !isoform_hash[accession_number].nil?
    seqs << isoform_hash[accession_number] + "\n"
  end
end
|
216
|
+
|
217
|
+
# Appends one UniProt record to +seqs+ using a UniProt/NCBI-style FASTA
# header:
#   >db|accession|entry_id description OS=organism GN=gene PE=n SV=n
#
# uniprot_record - object responding to #accession, #entry_id, #os, #seq,
#                  #description, #gn, #get('PE') and #dt.
# seqs           - String buffer the entry is appended to.
#
# db is 'sp' for "RecName" descriptions, 'tr' for "SubName" descriptions and
# empty otherwise. Values that cannot be extracted are left empty instead
# of raising (the original crashed on an empty description).
def ncbi_record(uniprot_record, seqs)
  accession_number = uniprot_record.accession
  id = uniprot_record.entry_id
  organism = uniprot_record.os.first.values.reverse.join(' ')
  sequence = uniprot_record.seq
  description = uniprot_record.description.split(';').first.to_s

  gn_field = uniprot_record.gn.first
  gene_name = gn_field.nil? ? nil : gn_field[:name]

  # Protein-existence level, e.g. "PE   1: Evidence at protein level;"
  prediction_status = uniprot_record.get('PE').to_s[/PE\s+(\d+):/, 1]
  # e.g. "..., sequence version 2."
  sequence_version = uniprot_record.dt['sequence'].to_s[/sequence version (\d+)./, 1]

  db = nil
  if description.include?('RecName: Full=')
    db = 'sp'
    description.sub!(/RecName: Full=/, '')
  elsif description.include?('SubName: Full=')
    db = 'tr'
    description.sub!(/SubName: Full=/, '')
  end

  seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
end
|
244
|
+
|
245
|
+
# Parses a UniProt flat file and returns the selected sequences as one
# FASTA-formatted String.
#
# file_name         - path of the UniProt .dat file (read via Bio::FlatFile).
# isoform_hash      - isoform FASTA entries keyed by accession, or nil.
# formatted_db_path - base output directory.
# db_name           - name of the sub-directory / files to create.
# options           - option hash; only :all is read here.
#
# With options[:all] every record is exported through ncbi_record and no
# index is written. Otherwise only records accepted by complete? are
# exported through fln_record, which also fills
# <formatted_db_path>/<db_name>/<db_name>.index.
def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
  puts "filtering sequences from #{file_name}"

  db_folder = File.join(formatted_db_path, db_name)
  Dir.mkdir(db_folder) unless File.exist?(db_folder)

  main_name = File.join(db_folder, db_name)
  keep_all = options[:all]
  index = keep_all ? nil : File.open(main_name + '.index', 'w')
  seqs = ''
  begin
    Bio::FlatFile.auto(file_name).each_entry do |uniprot_record|
      if keep_all
        ncbi_record(uniprot_record, seqs)
      elsif complete?(uniprot_record) # Get attributes of full length records
        fln_record(uniprot_record, seqs, index, isoform_hash)
      end
    end
  ensure
    # Close the index even when parsing raises, so the handle is not leaked
    index.close if index
  end

  seqs
end
|
187
270
|
|
188
|
-
################################################### MAIN
|
189
271
|
|
190
|
-
|
272
|
+
|
273
|
+
##########################################################################################
|
274
|
+
## OPTIONS
|
275
|
+
##########################################################################################
|
276
|
+
|
277
|
+
# Command-line handling for download_fln_dbs.rb.
# Every option gets its default inside the OptionParser block; parse! then
# overrides the defaults from ARGV (and removes the parsed switches from it).
options = {}

# UniProt taxonomic divisions accepted by -u (also the default selection)
divs = %w{human fungi invertebrates mammals plants rodents vertebrates}

optparse = OptionParser.new do |opts|
  options[:uniprot_div] = divs
  opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
    temp_divs = uniprot_div.split(',')
    check_valid_ids = temp_divs - divs
    if !check_valid_ids.empty?
      puts 'This uniprot division not exists', check_valid_ids
      # The original called the undefined `process.exit`, which aborted with
      # a NameError; exit explicitly with a failure status instead.
      exit 1
    else
      options[:uniprot_div] = temp_divs
    end
  end

  options[:no_download] = false
  opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
    options[:no_download] = true
  end

  options[:no_ncrna] = false
  opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
    options[:no_ncrna] = true
  end

  options[:only_index] = false
  opts.on( '-i', '--only_index', 'Build annotation index only without blast DB') do
    options[:only_index] = true
  end

  options[:no_trembl] = false
  opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
    options[:no_trembl] = true
  end

  options[:all] = false
  opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
    options[:all] = true
  end

  # 0 disables cd-hit; any value > 0 is used as the identity threshold
  options[:cdhit] = 0
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
    options[:cdhit] = cdhit.to_f
  end

  options[:no_uniprot] = false
  opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
    options[:no_uniprot] = true
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
|
343
|
+
|
344
|
+
|
345
|
+
##############################################################################################
|
346
|
+
## MAIN
|
347
|
+
##############################################################################################
|
348
|
+
|
191
349
|
|
192
350
|
# Main flow: pick the database directory, then download and build the
# ncRNA and UniProt databases according to the parsed options.

# Use $BLASTDB when it points to an existing path ...
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
  # Guarded so that a second run does not fail with Errno::EEXIST
  Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
end

puts formatted_db_path

# Exported so that child processes started below see the same location
ENV['BLASTDB'] = formatted_db_path
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"

download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]

if !options[:no_download]
  conecta_uniprot(options[:uniprot_div], formatted_db_path)
  # Build the glob with File.join: the original concatenated the directory
  # and '*.gz' without a separator, so nothing matched unless the path
  # already ended in '/'. argv form keeps paths away from the shell.
  gz_files = Dir.glob(File.join(formatted_db_path, '*.gz'))
  system('gunzip', *gz_files) unless gz_files.empty?
end

if !options[:no_uniprot]
  # Splice-variant file. (The original author left an open "why?" here.)
  isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
  options[:uniprot_div].each do |db_group|
    filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
    filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl]
  end
end
puts "download_fln_dbs.rb has finished"
|