full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
File without changes
@@ -1,6 +1,6 @@
1
- === 0.0.8 2012-11-28
1
+ === 0.1.0 2013-09-12
2
2
 
3
- Protection against empty seqs when all seqs match against user_db
3
+ Major rewrite of script
4
4
 
5
5
  === 0.0.7 2012-07-25
6
6
 
@@ -1,29 +1,44 @@
1
+ Rakefile
2
+ script
3
+ script/generate
4
+ script/destroy
5
+ script/console
6
+ test
7
+ test/test_full_lengther_next.rb
8
+ test/test_helper.rb
9
+ bin/plot_taxonomy.rb
10
+ bin/plot_fln.rb
1
11
  bin/download_fln_dbs.rb
2
- bin/make_user_db.rb
3
12
  bin/full_lengther_next
13
+ bin/make_user_db.rb
14
+ bin/make_test_dataset.rb
15
+ PostInstall.txt
16
+ README.rdoc
4
17
  History.txt
5
- lib/full_lengther_next/classes/common_functions.rb
18
+ Manifest.txt
19
+ lib/full_lengther_next
20
+ lib/full_lengther_next/classes
21
+ lib/full_lengther_next/classes/blast_functions.rb
22
+ lib/full_lengther_next/classes/my_worker_manager_fln.rb
23
+ lib/full_lengther_next/classes/types.rb
6
24
  lib/full_lengther_next/classes/chimeric_seqs.rb
25
+ lib/full_lengther_next/classes/artifacts.rb
26
+ lib/full_lengther_next/classes/cdhit.rb
7
27
  lib/full_lengther_next/classes/fl_analysis.rb
8
28
  lib/full_lengther_next/classes/fl_string_utils.rb
9
- lib/full_lengther_next/classes/fln_stats.rb
10
- lib/full_lengther_next/classes/lcs.rb
11
29
  lib/full_lengther_next/classes/my_worker.rb
12
- lib/full_lengther_next/classes/my_worker_manager.rb
13
- lib/full_lengther_next/classes/nc_rna.rb
14
- lib/full_lengther_next/classes/orf.rb
15
30
  lib/full_lengther_next/classes/sequence.rb
31
+ lib/full_lengther_next/classes/my_worker_EST.rb
16
32
  lib/full_lengther_next/classes/test_code.rb
33
+ lib/full_lengther_next/classes/orf.rb
17
34
  lib/full_lengther_next/classes/une_los_hit.rb
35
+ lib/full_lengther_next/classes/warnings.rb
36
+ lib/full_lengther_next/classes/fln_stats.rb
37
+ lib/full_lengther_next/classes/my_worker_manager_EST.rb
38
+ lib/full_lengther_next/classes/nc_rna.rb
39
+ lib/full_lengther_next/classes/reptrans.rb
40
+ lib/full_lengther_next/classes/common_functions.rb
41
+ lib/full_lengther_next/classes/exonerate_result.rb
42
+ lib/full_lengther_next/classes/handle_db.rb
18
43
  lib/full_lengther_next.rb
19
- Manifest.txt
20
- PostInstall.txt
21
- Rakefile
22
- README.rdoc
23
- script
24
- script/console
25
- script/destroy
26
- script/generate
27
- test
28
- test/test_full_lengther_next.rb
29
- test/test_helper.rb
44
+ lib/expresscanvas.zip
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Hoe.plugin :newgem
11
11
  # Generate all the Rake tasks
12
12
  # Run 'rake -T' to see list of generated tasks (from gem root directory)
13
13
  $hoe = Hoe.spec 'full_lengther_next' do
14
- self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
14
+ self.developer 'Pedro Seoane & Noe Fernandez & Dario Guerrero ', 'seoanezonjic@hotmail.com & noeisneo@gmail.com & dariogf@gmail.com'
15
15
  self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
16
16
  self.rubyforge_name = self.name # TODO this is default value
17
17
  # self.extra_deps = [['activesupport','>= 2.0.2']]
@@ -20,9 +20,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
20
20
  # self.extra_deps << ['gnuplot','>=0']
21
21
  # self.extra_deps << ['term-ansicolor','>=1.0.5']
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
- self.extra_deps << ['scbi_blast','>=0.0.37']
23
+ self.extra_deps << ['scbi_blast','>=0.0.32']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
25
  self.extra_deps << ['scbi_fasta','>=0.1.7']
26
+ self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
27
+ self.extra_deps << ['bio', '>= 1.4.3']
26
28
  # self.extra_deps << ['scbi_fastq','>=0.0.13']
27
29
  self.extra_deps << ['scbi_plot','>=0.0.6']
28
30
  # self.extra_deps << ['scbi_math','>=0.0.1']
@@ -1,66 +1,94 @@
1
1
  #!/usr/bin/env ruby
2
-
2
+
3
3
  # 15-2-2011 Noe Fernandez-Pozo
4
4
  # Script to download Full-LengtherNext databases.
5
5
  # Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
6
6
 
7
+ ROOT_PATH=File.dirname(__FILE__)
8
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
9
+
10
+ require 'bio'
7
11
  require 'net/ftp'
8
12
  require 'open-uri'
13
+ require 'scbi_fasta'
14
+ require 'optparse'
15
+ require 'cdhit'
16
+ require 'handle_db'
17
+
18
+ ##############################################################################################
19
+ ## METHODS
20
+ #############################################################################################
21
+ def download_ncrna(formatted_db_path, no_download)
9
22
 
10
- ################################################### Functions
23
+ ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
24
+ ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
25
+ db_path = File.join(formatted_db_path, 'nc_rna_db')
26
+ source_file = File.join(db_path, ncrna_fasta)
11
27
 
12
- def download_ncrna(formatted_db_path)
13
28
 
14
- if !File.exists?(File.join(formatted_db_path, "nc_rna_db"))
15
- Dir.mkdir(File.join(formatted_db_path, "nc_rna_db"))
29
+ if !no_download
30
+ puts "Downloading ncRNA database"
31
+ open(ncrna_zip, 'wb') do |my_file|
32
+ my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
33
+ end
34
+ puts "\nncRNA database downloaded"
35
+ system("unzip", ncrna_zip, "-d", ncrna_fasta)
36
+ system("rm", ncrna_zip)
16
37
  end
17
38
 
18
- puts "Downloading ncRNA database"
19
- open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file|
20
- my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read
39
+ if File.exists?(ncrna_fasta)
40
+ Dir.mkdir(db_path) if !File.exists?(db_path)
41
+ db_files = File.join(db_path, 'ncrna')
42
+ filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
43
+ #system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
44
+ puts "\nncRNA database decompressed and cleaned"
45
+ do_makeblastdb(filtered_fasta, db_files, 'nucl')
46
+ system("rm #{ncrna_fasta}")
47
+ puts "\nncRNA database completed"
21
48
  end
22
- puts "\nncRNA database downloaded"
23
-
24
- ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip')
25
- ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db')
26
- system("unzip", ncrna_zip, "-d", ncrna_out_dir)
27
- system("rm", ncrna_zip)
28
-
29
- puts "\nncRNA database decompressed"
30
-
31
- ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta')
32
- system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids")
33
-
34
- puts "\nncRNA database completed"
49
+ end
50
+
51
+ def filtering_ncbi_seqs(fasta_file, max_length)
52
+ fasta = FastaQualFile.new(fasta_file)
53
+ filtered_fasta = ''
54
+ fasta.each do |name, seq, comments|
55
+ name ="#{name} #{comments}"
56
+ if seq.length >= max_length
57
+ fields = name.split('|')
58
+ if fields[1] == '' || name.include?('||')
59
+ new_name = name
60
+ else #Cut huge description
61
+ new_name = fields[0]+'|'
62
+ ids = fields[1].split(',')
63
+ new_name << "#{ids.first}\|#{fields[2]}"
64
+ end
65
+ filtered_fasta << ">#{new_name}\n#{seq}\n"
66
+ end
67
+ end
68
+ return filtered_fasta
35
69
  end
36
70
 
37
71
  def conecta_uniprot(my_array, formatted_db_path)
38
72
 
73
+ Dir.mkdir(formatted_db_path) if !File.exists?(formatted_db_path)
74
+ varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
75
+
39
76
  $ftp = Net::FTP.new()
40
-
41
- if !File.exists?(formatted_db_path)
42
- Dir.mkdir(formatted_db_path)
43
- end
44
-
45
- $ftp.connect('ftp.uniprot.org')
46
-
77
+ $ftp.connect('ftp.ebi.ac.uk')
47
78
  $ftp.login
48
79
 
49
80
  puts "connected to UniProt"
50
-
51
81
  my_array.each do |db_group|
52
82
  puts "Downloading #{db_group}"
53
83
  download_uniprot(db_group, formatted_db_path)
54
84
  end
55
-
56
- varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
85
+
86
+ #Splicing variants (isoforms) file. WHY?
57
87
  $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
58
88
  $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
59
-
60
- puts "isoform files downloaded"
61
-
62
89
  $ftp.close
63
90
 
91
+ puts "isoform files downloaded"
64
92
  end
65
93
 
66
94
  def download_uniprot(uniprot_group, formatted_db_path)
@@ -70,156 +98,280 @@ def download_uniprot(uniprot_group, formatted_db_path)
70
98
  $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
71
99
  $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
72
100
  $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
73
-
101
+
74
102
  puts "#{uniprot_group} files downloaded"
75
103
 
76
104
  end
77
105
 
78
- def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path)
79
-
80
- puts "filtering sequences from #{file_name}"
81
-
82
- # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
83
- #
84
- # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
85
- # FT NON_TER 1 1
86
- # FT NON_TER 29 29
87
- # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
88
- # FT NON_CONS 1683 1684
89
- #
90
- # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
91
-
92
- newseq=false
93
- print_seq=true
94
- id=''
95
- description = ''
96
- organism_name = ''
97
- seq = ''
98
- organelle = ''
99
-
100
- file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
101
- db_name = $1
102
- output_name = $2
103
- db_name.sub!('sprot','sp')
104
- db_name.sub!('trembl','tr')
105
-
106
- if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
107
- Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
108
- end
109
-
110
- output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
111
-
112
- File.open(file_name).each_line do |line|
113
- if (newseq == false)
114
- if (line =~ /^AC\s+(\w+);/)
115
- id=$1
116
- newseq = true
117
- description = ''
118
- organism_name = ''
119
- seq = ''
120
- print_seq = true
121
- organelle = ''
122
- end
123
- else
124
- if (line =~ /^DE\s+(.+)\;*/)
125
- if (description == '')
126
- description = $1
127
- description.sub!(/RecName: Full=/,'sp=')
128
- description.sub!(/SubName: Full=/,'tr=')
129
- end
130
- if (line =~ /Flags: Fragment/)
131
- # puts "#{id} #{line}"
132
- print_seq=false
133
- end
134
- elsif (line =~ /^OS\s+(.+)/)
135
- organism_name = $1
136
- elsif (line =~ /^OG\s+(.+)/)
137
- organelle = $1
138
- elsif (line =~ /^FT\s+NON_TER\s+/)
139
- print_seq=false
140
- # puts "#{id} NON_TER"
141
- elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
142
- print_seq=false
143
- # puts "#{id} NON_CONS"
144
- elsif (line =~ /^\s+([\w\s]+)/)
145
- seq += $1
146
- elsif (line =~ /^\/\//)
147
- seq.gsub!(/\s*/,'')
148
- if (seq !~ /^M/i)
149
- print_seq=false
150
- end
151
- newseq = false
152
-
153
- if (print_seq)
154
- output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
155
- if (!isoform_hash[id].nil?)
156
- output_file.puts isoform_hash[id]
157
- end
158
- end
106
+
107
+
108
+ def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
109
+ file_name = prefix +'_' + db_group
110
+ puts 'Building ' + file_name
111
+ fasta = File.join(formatted_db_path,"#{file_name}","#{file_name}.fasta")
112
+ blastdb_input = fasta.gsub('.fasta', '')
113
+ current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
114
+ if File.exists?(current_db_source)
115
+ seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
116
+ if !options[:only_index]
117
+ if options[:cdhit] > 0
118
+ output_file = File.open(fasta, 'w')
119
+ output_file.puts seqs
120
+ output_file.close
121
+ system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids")
122
+ else
123
+ do_makeblastdb(seqs, blastdb_input, 'prot')
159
124
  end
160
125
  end
161
126
  end
162
- output_file.close
163
127
  end
164
128
 
165
- def load_isoform_hash(file)
129
+ def complete?(uniprot_record)
130
+ complete = TRUE
131
+ if uniprot_record.description.include?('Flags: Fragment') || #Discard non full length records
132
+ uniprot_record.seq[0] != 'M' ||
133
+ uniprot_record.seq.include?('XX') ||
134
+ uniprot_record.ft.keys.include?('NON_TER') ||# The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key
135
+ uniprot_record.ft.keys.include?('NON_CONS') # Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them
136
+ complete = FALSE
137
+ end
138
+ return complete
139
+ end
140
+
141
+ def fln_record(uniprot_record, seqs, index, isoform_hash)
142
+ index_record = []
143
+ # Primary data
144
+ accession_number = uniprot_record.accession
145
+ description_data = uniprot_record.description.split(';')
146
+ description = description_data.first
147
+ description.sub!(/RecName: Full=/,'sp=')
148
+ description.sub!(/SubName: Full=/,'tr=')
149
+ description.sub!(/{\S*}/,'')
150
+
151
+ organism = uniprot_record.os.first.values.reverse.join(' ')
152
+ organelle = uniprot_record.og.join(' ')
153
+ sequence = uniprot_record.seq.gsub('U','X')
154
+
155
+ # Secondary data
156
+ index_record << accession_number
157
+ index_record << description
158
+ taxonomy = uniprot_record.oc.join(';')
159
+ index_record << taxonomy
160
+ index_record << organism
161
+ index_record << sequence
162
+ if !organelle.empty?
163
+ index_record << organelle
164
+ else
165
+ index_record << '-'
166
+ end
167
+ go_data = uniprot_record.dr['GO']
168
+ if !go_data.nil?
169
+ index_record << go_data.map{|go| go[0]}.join(';') # GO ID
170
+ index_record << go_data.map{|go| go[1]}.join(';') # GO Description
171
+ else
172
+ index_record << '-'
173
+ index_record << '-'
174
+ end
175
+ kegg_data = uniprot_record.dr['KEGG']
176
+ if !kegg_data.nil?
177
+ index_record << kegg_data.map{|kegg| kegg[0]}.join(';')
178
+ else
179
+ index_record << '-'
180
+ end
181
+ interpro_data = uniprot_record.dr['InterPro']
182
+ if !interpro_data.nil?
183
+ index_record << interpro_data.map{|ip| ip[0]}.join(';') # interpro ID
184
+ index_record << interpro_data.map{|ip| ip[1]}.join(';') # ip Description
185
+ else
186
+ index_record << '-'
187
+ index_record << '-'
188
+ end
189
+
190
+ if !description_data[1].nil? && description_data[1].include?('EC=')
191
+ index_record << description_data[1].split(' ').first.gsub('=',':')
192
+ else
193
+ index_record << '-'
194
+ end
195
+
196
+ pfam_data = uniprot_record.dr['Pfam']
197
+ if !pfam_data.nil?
198
+ index_record << pfam_data.map!{|pf| pf[0]}.join(';') # pfam ID
199
+ index_record << pfam_data.map!{|pf| pf[1]}.join(';') # pfam description
200
+ else
201
+ index_record << '-'
202
+ index_record << '-'
203
+ end
204
+
205
+ unipathway_data = uniprot_record.dr['UniPathway']
206
+ if !unipathway_data.nil?
207
+ index_record << unipathway_data.map!{|pf| pf[0]}.join(';') # unipathway ID
208
+ else
209
+ index_record << '-'
210
+ end
211
+
212
+ seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
213
+ index.puts index_record.join("\t")
214
+ seqs << isoform_hash[accession_number]+"\n" if !isoform_hash.nil? && !isoform_hash[accession_number].nil?
215
+ end
216
+
217
+ def ncbi_record(uniprot_record, seqs)
218
+ accession_number = uniprot_record.accession
219
+ id = uniprot_record.entry_id
220
+ organism = uniprot_record.os.first.values.reverse.join(' ')
221
+ sequence = uniprot_record.seq
222
+ description = uniprot_record.description.split(';').first
223
+ gene_name = nil
224
+ gn_field = uniprot_record.gn.first
225
+ gene_name = gn_field[:name] if !gn_field.nil?
226
+ prediction_field = uniprot_record.get('PE')
227
+ prediction_field =~ /PE\s+(\d+):/
228
+ prediction_status = $1
229
+ sequence_version_field = uniprot_record.dt['sequence']
230
+ sequence_version_field =~ /sequence version (\d+)./
231
+ sequence_version = $1
232
+ db = nil
233
+ if description.include?('RecName: Full=')
234
+ db = 'sp'
235
+ description.sub!(/RecName: Full=/,'')
236
+ elsif description.include?('SubName: Full=')
237
+ db = 'tr'
238
+ description.sub!(/SubName: Full=/,'')
239
+ end
240
+ taxonomy = uniprot_record.oc.join(';')
241
+
242
+ seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
243
+ end
244
+
245
+ def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
166
246
 
167
- isoform_hash = {}
168
- my_fasta = ''
169
- acc = ''
170
- File.open(file).each do |line|
171
- line.chomp!
172
- if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/)
173
- if (isoform_hash[acc].nil?)
174
- isoform_hash[acc]= "#{my_fasta}\n"
247
+ puts "filtering sequences from #{file_name}"
248
+
249
+ db_folder = File.join(formatted_db_path, db_name)
250
+ Dir.mkdir(db_folder) if !File.exists?(db_folder)
251
+
252
+ main_name = File.join(db_folder, db_name)
253
+ index = File.open(main_name + '.index', 'w') if !options[:all]
254
+ seqs = ''
255
+ Bio::FlatFile.auto(file_name).each_entry {|uniprot_record|
256
+ if !options[:all] && !complete?(uniprot_record)
257
+ next
258
+ else #Get attributes of full length records
259
+ if options[:all]
260
+ ncbi_record(uniprot_record, seqs)
175
261
  else
176
- isoform_hash[acc]+= "#{my_fasta}\n"
262
+ fln_record(uniprot_record, seqs, index, isoform_hash)
177
263
  end
178
- my_fasta = "#{$1}\n"
179
- acc = $2
180
- else
181
- my_fasta += line
182
264
  end
183
- end
184
-
185
- return isoform_hash
265
+ }
266
+
267
+ index.close if !options[:all]
268
+ return seqs
186
269
  end
187
270
 
188
- ################################################### MAIN
189
271
 
190
- ROOT_PATH=File.dirname(__FILE__)
272
+
273
+ ##########################################################################################
274
+ ## OPTIONS
275
+ ##########################################################################################
276
+
277
+ options = {}
278
+
279
+ divs = %w{human fungi invertebrates mammals plants rodents vertebrates}
280
+
281
+ optparse = OptionParser.new do |opts|
282
+ options[:uniprot_div] = divs
283
+ opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
284
+ temp_divs = uniprot_div.split(',')
285
+ check_valid_ids = temp_divs - divs
286
+ if !check_valid_ids.empty?
287
+ puts 'This uniprot division not exists', check_valid_ids
288
+ process.exit
289
+ else
290
+ options[:uniprot_div] = temp_divs
291
+ end
292
+ end
293
+
294
+ options[:no_download] = FALSE
295
+ opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
296
+ options[:no_download] = TRUE
297
+ end
298
+
299
+ options[:no_ncrna] = FALSE
300
+ opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
301
+ options[:no_ncrna] = TRUE
302
+ end
303
+
304
+ options[:only_index] = FALSE
305
+ opts.on( '-i', '--only_index', 'Build annotation index only without blast DB') do
306
+ options[:only_index] = TRUE
307
+ end
308
+
309
+ options[:no_trembl] = FALSE
310
+ opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
311
+ options[:no_trembl] = TRUE
312
+ end
313
+
314
+ options[:all] = FALSE
315
+ opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
316
+ options[:all] = TRUE
317
+ end
318
+
319
+ options[:cdhit] = 0
320
+ opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
321
+ options[:cdhit] = cdhit.to_f
322
+ end
323
+
324
+ options[:no_uniprot] = FALSE
325
+ opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
326
+ options[:no_uniprot] = TRUE
327
+ end
328
+
329
+
330
+ # Set a banner, displayed at the top of the help screen.
331
+ opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"
332
+
333
+ # This displays the help screen
334
+ opts.on( '-h', '--help', 'Display this screen' ) do
335
+ puts opts
336
+ exit
337
+ end
338
+
339
+ end # End opts
340
+
341
+ # parse options and remove from ARGV
342
+ optparse.parse!
343
+
344
+
345
+ ##############################################################################################
346
+ ## MAIN
347
+ ##############################################################################################
348
+
191
349
 
192
350
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
193
351
  formatted_db_path = ENV['BLASTDB']
194
352
  else # otherwise use ROOTPATH + DB
195
353
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
354
+ Dir.mkdir(formatted_db_path)
196
355
  end
197
356
 
198
- ENV['BLASTDB']=formatted_db_path
357
+ puts formatted_db_path
358
+
359
+ ENV['BLASTDB'] = formatted_db_path
199
360
  puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
200
361
  puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"
201
362
 
202
- my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"]
203
- # my_array = ["plants","human"] # used for a shoter test
204
-
205
- conecta_uniprot(my_array, formatted_db_path)
206
- system('gunzip '+formatted_db_path+'*.gz')
207
-
208
- isoform_hash = {}
209
- isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
363
+ download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]
210
364
 
211
- download_ncrna(formatted_db_path)
212
-
213
- my_array.each do |db_group|
214
-
215
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path)
216
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path)
217
-
218
- sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta")
219
- tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta")
220
- system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids")
221
- system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids")
222
-
365
+ if !options[:no_download]
366
+ conecta_uniprot(options[:uniprot_div], formatted_db_path)
367
+ system('gunzip '+formatted_db_path+'*.gz')
223
368
  end
224
369
 
225
- puts "download_fln_dbs.rb has finished"
370
+ if !options[:no_uniprot]
371
+ isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta")) #Splicing variants (isoforms) file. WHY?
372
+ options[:uniprot_div].each do |db_group|
373
+ filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
374
+ filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl]
375
+ end
376
+ end
377
+ puts "download_fln_dbs.rb has finished"