full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
data/.gemtest: File without changes
@@ -1,6 +1,6 @@
1
- === 0.0.8 2012-11-28
1
+ === 0.1.0 2013-09-12
2
2
 
3
- Protection against empty seqs when all seqs match against user_db
3
+ Major rewrite of script
4
4
 
5
5
  === 0.0.7 2012-07-25
6
6
 
@@ -1,29 +1,44 @@
1
+ Rakefile
2
+ script
3
+ script/generate
4
+ script/destroy
5
+ script/console
6
+ test
7
+ test/test_full_lengther_next.rb
8
+ test/test_helper.rb
9
+ bin/plot_taxonomy.rb
10
+ bin/plot_fln.rb
1
11
  bin/download_fln_dbs.rb
2
- bin/make_user_db.rb
3
12
  bin/full_lengther_next
13
+ bin/make_user_db.rb
14
+ bin/make_test_dataset.rb
15
+ PostInstall.txt
16
+ README.rdoc
4
17
  History.txt
5
- lib/full_lengther_next/classes/common_functions.rb
18
+ Manifest.txt
19
+ lib/full_lengther_next
20
+ lib/full_lengther_next/classes
21
+ lib/full_lengther_next/classes/blast_functions.rb
22
+ lib/full_lengther_next/classes/my_worker_manager_fln.rb
23
+ lib/full_lengther_next/classes/types.rb
6
24
  lib/full_lengther_next/classes/chimeric_seqs.rb
25
+ lib/full_lengther_next/classes/artifacts.rb
26
+ lib/full_lengther_next/classes/cdhit.rb
7
27
  lib/full_lengther_next/classes/fl_analysis.rb
8
28
  lib/full_lengther_next/classes/fl_string_utils.rb
9
- lib/full_lengther_next/classes/fln_stats.rb
10
- lib/full_lengther_next/classes/lcs.rb
11
29
  lib/full_lengther_next/classes/my_worker.rb
12
- lib/full_lengther_next/classes/my_worker_manager.rb
13
- lib/full_lengther_next/classes/nc_rna.rb
14
- lib/full_lengther_next/classes/orf.rb
15
30
  lib/full_lengther_next/classes/sequence.rb
31
+ lib/full_lengther_next/classes/my_worker_EST.rb
16
32
  lib/full_lengther_next/classes/test_code.rb
33
+ lib/full_lengther_next/classes/orf.rb
17
34
  lib/full_lengther_next/classes/une_los_hit.rb
35
+ lib/full_lengther_next/classes/warnings.rb
36
+ lib/full_lengther_next/classes/fln_stats.rb
37
+ lib/full_lengther_next/classes/my_worker_manager_EST.rb
38
+ lib/full_lengther_next/classes/nc_rna.rb
39
+ lib/full_lengther_next/classes/reptrans.rb
40
+ lib/full_lengther_next/classes/common_functions.rb
41
+ lib/full_lengther_next/classes/exonerate_result.rb
42
+ lib/full_lengther_next/classes/handle_db.rb
18
43
  lib/full_lengther_next.rb
19
- Manifest.txt
20
- PostInstall.txt
21
- Rakefile
22
- README.rdoc
23
- script
24
- script/console
25
- script/destroy
26
- script/generate
27
- test
28
- test/test_full_lengther_next.rb
29
- test/test_helper.rb
44
+ lib/expresscanvas.zip
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Hoe.plugin :newgem
11
11
  # Generate all the Rake tasks
12
12
  # Run 'rake -T' to see list of generated tasks (from gem root directory)
13
13
  $hoe = Hoe.spec 'full_lengther_next' do
14
- self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
14
+ self.developer 'Pedro Seoane & Noe Fernandez & Dario Guerrero ', 'seoanezonjic@hotmail.com & noeisneo@gmail.com & dariogf@gmail.com'
15
15
  self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
16
16
  self.rubyforge_name = self.name # TODO this is default value
17
17
  # self.extra_deps = [['activesupport','>= 2.0.2']]
@@ -20,9 +20,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
20
20
  # self.extra_deps << ['gnuplot','>=0']
21
21
  # self.extra_deps << ['term-ansicolor','>=1.0.5']
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
- self.extra_deps << ['scbi_blast','>=0.0.37']
23
+ self.extra_deps << ['scbi_blast','>=0.0.32']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
25
  self.extra_deps << ['scbi_fasta','>=0.1.7']
26
+ self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
27
+ self.extra_deps << ['bio', '>= 1.4.3']
26
28
  # self.extra_deps << ['scbi_fastq','>=0.0.13']
27
29
  self.extra_deps << ['scbi_plot','>=0.0.6']
28
30
  # self.extra_deps << ['scbi_math','>=0.0.1']
@@ -1,66 +1,94 @@
1
1
  #!/usr/bin/env ruby
2
-
2
+
3
3
  # 15-2-2011 Noe Fernandez-Pozo
4
4
  # Script to download Full-LengtherNext databases.
5
5
  # Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
6
6
 
7
+ ROOT_PATH=File.dirname(__FILE__)
8
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
9
+
10
+ require 'bio'
7
11
  require 'net/ftp'
8
12
  require 'open-uri'
13
+ require 'scbi_fasta'
14
+ require 'optparse'
15
+ require 'cdhit'
16
+ require 'handle_db'
17
+
18
+ ##############################################################################################
19
+ ## METHODS
20
+ #############################################################################################
21
+ def download_ncrna(formatted_db_path, no_download)
9
22
 
10
- ################################################### Functions
23
+ ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
24
+ ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
25
+ db_path = File.join(formatted_db_path, 'nc_rna_db')
26
+ source_file = File.join(db_path, ncrna_fasta)
11
27
 
12
- def download_ncrna(formatted_db_path)
13
28
 
14
- if !File.exists?(File.join(formatted_db_path, "nc_rna_db"))
15
- Dir.mkdir(File.join(formatted_db_path, "nc_rna_db"))
29
+ if !no_download
30
+ puts "Downloading ncRNA database"
31
+ open(ncrna_zip, 'wb') do |my_file|
32
+ my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
33
+ end
34
+ puts "\nncRNA database downloaded"
35
+ system("unzip", ncrna_zip, "-d", ncrna_fasta)
36
+ system("rm", ncrna_zip)
16
37
  end
17
38
 
18
- puts "Downloading ncRNA database"
19
- open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file|
20
- my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read
39
+ if File.exists?(ncrna_fasta)
40
+ Dir.mkdir(db_path) if !File.exists?(db_path)
41
+ db_files = File.join(db_path, 'ncrna')
42
+ filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
43
+ #system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
44
+ puts "\nncRNA database decompressed and cleaned"
45
+ do_makeblastdb(filtered_fasta, db_files, 'nucl')
46
+ system("rm #{ncrna_fasta}")
47
+ puts "\nncRNA database completed"
21
48
  end
22
- puts "\nncRNA database downloaded"
23
-
24
- ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip')
25
- ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db')
26
- system("unzip", ncrna_zip, "-d", ncrna_out_dir)
27
- system("rm", ncrna_zip)
28
-
29
- puts "\nncRNA database decompressed"
30
-
31
- ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta')
32
- system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids")
33
-
34
- puts "\nncRNA database completed"
49
+ end
50
+
51
+ def filtering_ncbi_seqs(fasta_file, max_length)
52
+ fasta = FastaQualFile.new(fasta_file)
53
+ filtered_fasta = ''
54
+ fasta.each do |name, seq, comments|
55
+ name ="#{name} #{comments}"
56
+ if seq.length >= max_length
57
+ fields = name.split('|')
58
+ if fields[1] == '' || name.include?('||')
59
+ new_name = name
60
+ else #Cut huge description
61
+ new_name = fields[0]+'|'
62
+ ids = fields[1].split(',')
63
+ new_name << "#{ids.first}\|#{fields[2]}"
64
+ end
65
+ filtered_fasta << ">#{new_name}\n#{seq}\n"
66
+ end
67
+ end
68
+ return filtered_fasta
35
69
  end
36
70
 
37
71
  def conecta_uniprot(my_array, formatted_db_path)
38
72
 
73
+ Dir.mkdir(formatted_db_path) if !File.exists?(formatted_db_path)
74
+ varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
75
+
39
76
  $ftp = Net::FTP.new()
40
-
41
- if !File.exists?(formatted_db_path)
42
- Dir.mkdir(formatted_db_path)
43
- end
44
-
45
- $ftp.connect('ftp.uniprot.org')
46
-
77
+ $ftp.connect('ftp.ebi.ac.uk')
47
78
  $ftp.login
48
79
 
49
80
  puts "connected to UniProt"
50
-
51
81
  my_array.each do |db_group|
52
82
  puts "Downloading #{db_group}"
53
83
  download_uniprot(db_group, formatted_db_path)
54
84
  end
55
-
56
- varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
85
+
86
+ #archivo de variantes de splicing. POR QUE?
57
87
  $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
58
88
  $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
59
-
60
- puts "isoform files downloaded"
61
-
62
89
  $ftp.close
63
90
 
91
+ puts "isoform files downloaded"
64
92
  end
65
93
 
66
94
  def download_uniprot(uniprot_group, formatted_db_path)
@@ -70,156 +98,280 @@ def download_uniprot(uniprot_group, formatted_db_path)
70
98
  $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
71
99
  $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
72
100
  $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
73
-
101
+
74
102
  puts "#{uniprot_group} files downloaded"
75
103
 
76
104
  end
77
105
 
78
- def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path)
79
-
80
- puts "filtering sequences from #{file_name}"
81
-
82
- # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
83
- #
84
- # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
85
- # FT NON_TER 1 1
86
- # FT NON_TER 29 29
87
- # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
88
- # FT NON_CONS 1683 1684
89
- #
90
- # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
91
-
92
- newseq=false
93
- print_seq=true
94
- id=''
95
- description = ''
96
- organism_name = ''
97
- seq = ''
98
- organelle = ''
99
-
100
- file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
101
- db_name = $1
102
- output_name = $2
103
- db_name.sub!('sprot','sp')
104
- db_name.sub!('trembl','tr')
105
-
106
- if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
107
- Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
108
- end
109
-
110
- output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
111
-
112
- File.open(file_name).each_line do |line|
113
- if (newseq == false)
114
- if (line =~ /^AC\s+(\w+);/)
115
- id=$1
116
- newseq = true
117
- description = ''
118
- organism_name = ''
119
- seq = ''
120
- print_seq = true
121
- organelle = ''
122
- end
123
- else
124
- if (line =~ /^DE\s+(.+)\;*/)
125
- if (description == '')
126
- description = $1
127
- description.sub!(/RecName: Full=/,'sp=')
128
- description.sub!(/SubName: Full=/,'tr=')
129
- end
130
- if (line =~ /Flags: Fragment/)
131
- # puts "#{id} #{line}"
132
- print_seq=false
133
- end
134
- elsif (line =~ /^OS\s+(.+)/)
135
- organism_name = $1
136
- elsif (line =~ /^OG\s+(.+)/)
137
- organelle = $1
138
- elsif (line =~ /^FT\s+NON_TER\s+/)
139
- print_seq=false
140
- # puts "#{id} NON_TER"
141
- elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
142
- print_seq=false
143
- # puts "#{id} NON_CONS"
144
- elsif (line =~ /^\s+([\w\s]+)/)
145
- seq += $1
146
- elsif (line =~ /^\/\//)
147
- seq.gsub!(/\s*/,'')
148
- if (seq !~ /^M/i)
149
- print_seq=false
150
- end
151
- newseq = false
152
-
153
- if (print_seq)
154
- output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
155
- if (!isoform_hash[id].nil?)
156
- output_file.puts isoform_hash[id]
157
- end
158
- end
106
+
107
+
108
+ def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
109
+ file_name = prefix +'_' + db_group
110
+ puts 'Building ' + file_name
111
+ fasta = File.join(formatted_db_path,"#{file_name}","#{file_name}.fasta")
112
+ blastdb_input = fasta.gsub('.fasta', '')
113
+ current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
114
+ if File.exists?(current_db_source)
115
+ seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
116
+ if !options[:only_index]
117
+ if options[:cdhit] > 0
118
+ output_file = File.open(fasta, 'w')
119
+ output_file.puts seqs
120
+ output_file.close
121
+ system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids")
122
+ else
123
+ do_makeblastdb(seqs, blastdb_input, 'prot')
159
124
  end
160
125
  end
161
126
  end
162
- output_file.close
163
127
  end
164
128
 
165
- def load_isoform_hash(file)
129
+ def complete?(uniprot_record)
130
+ complete = TRUE
131
+ if uniprot_record.description.include?('Flags: Fragment') || #Discard non full length records
132
+ uniprot_record.seq[0] != 'M' ||
133
+ uniprot_record.seq.include?('XX') ||
134
+ uniprot_record.ft.keys.include?('NON_TER') ||# The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key
135
+ uniprot_record.ft.keys.include?('NON_CONS') # Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them
136
+ complete = FALSE
137
+ end
138
+ return complete
139
+ end
140
+
141
+ def fln_record(uniprot_record, seqs, index, isoform_hash)
142
+ index_record = []
143
+ # Primary data
144
+ accession_number = uniprot_record.accession
145
+ description_data = uniprot_record.description.split(';')
146
+ description = description_data.first
147
+ description.sub!(/RecName: Full=/,'sp=')
148
+ description.sub!(/SubName: Full=/,'tr=')
149
+ description.sub!(/{\S*}/,'')
150
+
151
+ organism = uniprot_record.os.first.values.reverse.join(' ')
152
+ organelle = uniprot_record.og.join(' ')
153
+ sequence = uniprot_record.seq.gsub('U','X')
154
+
155
+ # Secondary data
156
+ index_record << accession_number
157
+ index_record << description
158
+ taxonomy = uniprot_record.oc.join(';')
159
+ index_record << taxonomy
160
+ index_record << organism
161
+ index_record << sequence
162
+ if !organelle.empty?
163
+ index_record << organelle
164
+ else
165
+ index_record << '-'
166
+ end
167
+ go_data = uniprot_record.dr['GO']
168
+ if !go_data.nil?
169
+ index_record << go_data.map{|go| go[0]}.join(';') # GO ID
170
+ index_record << go_data.map{|go| go[1]}.join(';') # GO Description
171
+ else
172
+ index_record << '-'
173
+ index_record << '-'
174
+ end
175
+ kegg_data = uniprot_record.dr['KEGG']
176
+ if !kegg_data.nil?
177
+ index_record << kegg_data.map{|kegg| kegg[0]}.join(';')
178
+ else
179
+ index_record << '-'
180
+ end
181
+ interpro_data = uniprot_record.dr['InterPro']
182
+ if !interpro_data.nil?
183
+ index_record << interpro_data.map{|ip| ip[0]}.join(';') # interpro ID
184
+ index_record << interpro_data.map{|ip| ip[1]}.join(';') # ip Description
185
+ else
186
+ index_record << '-'
187
+ index_record << '-'
188
+ end
189
+
190
+ if !description_data[1].nil? && description_data[1].include?('EC=')
191
+ index_record << description_data[1].split(' ').first.gsub('=',':')
192
+ else
193
+ index_record << '-'
194
+ end
195
+
196
+ pfam_data = uniprot_record.dr['Pfam']
197
+ if !pfam_data.nil?
198
+ index_record << pfam_data.map!{|pf| pf[0]}.join(';') # pfam ID
199
+ index_record << pfam_data.map!{|pf| pf[1]}.join(';') # pfam description
200
+ else
201
+ index_record << '-'
202
+ index_record << '-'
203
+ end
204
+
205
+ unipathway_data = uniprot_record.dr['UniPathway']
206
+ if !unipathway_data.nil?
207
+ index_record << unipathway_data.map!{|pf| pf[0]}.join(';') # unipathway ID
208
+ else
209
+ index_record << '-'
210
+ end
211
+
212
+ seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
213
+ index.puts index_record.join("\t")
214
+ seqs << isoform_hash[accession_number]+"\n" if !isoform_hash.nil? && !isoform_hash[accession_number].nil?
215
+ end
216
+
217
+ def ncbi_record(uniprot_record, seqs)
218
+ accession_number = uniprot_record.accession
219
+ id = uniprot_record.entry_id
220
+ organism = uniprot_record.os.first.values.reverse.join(' ')
221
+ sequence = uniprot_record.seq
222
+ description = uniprot_record.description.split(';').first
223
+ gene_name = nil
224
+ gn_field = uniprot_record.gn.first
225
+ gene_name = gn_field[:name] if !gn_field.nil?
226
+ prediction_field = uniprot_record.get('PE')
227
+ prediction_field =~ /PE\s+(\d+):/
228
+ prediction_status = $1
229
+ sequence_version_field = uniprot_record.dt['sequence']
230
+ sequence_version_field =~ /sequence version (\d+)./
231
+ sequence_version = $1
232
+ db = nil
233
+ if description.include?('RecName: Full=')
234
+ db = 'sp'
235
+ description.sub!(/RecName: Full=/,'')
236
+ elsif description.include?('SubName: Full=')
237
+ db = 'tr'
238
+ description.sub!(/SubName: Full=/,'')
239
+ end
240
+ taxonomy = uniprot_record.oc.join(';')
241
+
242
+ seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
243
+ end
244
+
245
+ def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
166
246
 
167
- isoform_hash = {}
168
- my_fasta = ''
169
- acc = ''
170
- File.open(file).each do |line|
171
- line.chomp!
172
- if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/)
173
- if (isoform_hash[acc].nil?)
174
- isoform_hash[acc]= "#{my_fasta}\n"
247
+ puts "filtering sequences from #{file_name}"
248
+
249
+ db_folder = File.join(formatted_db_path, db_name)
250
+ Dir.mkdir(db_folder) if !File.exists?(db_folder)
251
+
252
+ main_name = File.join(db_folder, db_name)
253
+ index = File.open(main_name + '.index', 'w') if !options[:all]
254
+ seqs = ''
255
+ Bio::FlatFile.auto(file_name).each_entry {|uniprot_record|
256
+ if !options[:all] && !complete?(uniprot_record)
257
+ next
258
+ else #Get attributes of full length records
259
+ if options[:all]
260
+ ncbi_record(uniprot_record, seqs)
175
261
  else
176
- isoform_hash[acc]+= "#{my_fasta}\n"
262
+ fln_record(uniprot_record, seqs, index, isoform_hash)
177
263
  end
178
- my_fasta = "#{$1}\n"
179
- acc = $2
180
- else
181
- my_fasta += line
182
264
  end
183
- end
184
-
185
- return isoform_hash
265
+ }
266
+
267
+ index.close if !options[:all]
268
+ return seqs
186
269
  end
187
270
 
188
- ################################################### MAIN
189
271
 
190
- ROOT_PATH=File.dirname(__FILE__)
272
+
273
+ ##########################################################################################
274
+ ## OPTIONS
275
+ ##########################################################################################
276
+
277
+ options = {}
278
+
279
+ divs = %w{human fungi invertebrates mammals plants rodents vertebrates}
280
+
281
+ optparse = OptionParser.new do |opts|
282
+ options[:uniprot_div] = divs
283
+ opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
284
+ temp_divs = uniprot_div.split(',')
285
+ check_valid_ids = temp_divs - divs
286
+ if !check_valid_ids.empty?
287
+ puts 'This uniprot division not exists', check_valid_ids
288
+ process.exit
289
+ else
290
+ options[:uniprot_div] = temp_divs
291
+ end
292
+ end
293
+
294
+ options[:no_download] = FALSE
295
+ opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
296
+ options[:no_download] = TRUE
297
+ end
298
+
299
+ options[:no_ncrna] = FALSE
300
+ opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
301
+ options[:no_ncrna] = TRUE
302
+ end
303
+
304
+ options[:only_index] = FALSE
305
+ opts.on( '-i', '--only_index', 'Build annotation index only without blast DB') do
306
+ options[:only_index] = TRUE
307
+ end
308
+
309
+ options[:no_trembl] = FALSE
310
+ opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
311
+ options[:no_trembl] = TRUE
312
+ end
313
+
314
+ options[:all] = FALSE
315
+ opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
316
+ options[:all] = TRUE
317
+ end
318
+
319
+ options[:cdhit] = 0
320
+ opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
321
+ options[:cdhit] = cdhit.to_f
322
+ end
323
+
324
+ options[:no_uniprot] = FALSE
325
+ opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
326
+ options[:no_uniprot] = TRUE
327
+ end
328
+
329
+
330
+ # Set a banner, displayed at the top of the help screen.
331
+ opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"
332
+
333
+ # This displays the help screen
334
+ opts.on( '-h', '--help', 'Display this screen' ) do
335
+ puts opts
336
+ exit
337
+ end
338
+
339
+ end # End opts
340
+
341
+ # parse options and remove from ARGV
342
+ optparse.parse!
343
+
344
+
345
+ ##############################################################################################
346
+ ## MAIN
347
+ ##############################################################################################
348
+
191
349
 
192
350
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
193
351
  formatted_db_path = ENV['BLASTDB']
194
352
  else # otherwise use ROOTPATH + DB
195
353
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
354
+ Dir.mkdir(formatted_db_path)
196
355
  end
197
356
 
198
- ENV['BLASTDB']=formatted_db_path
357
+ puts formatted_db_path
358
+
359
+ ENV['BLASTDB'] = formatted_db_path
199
360
  puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
200
361
  puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"
201
362
 
202
- my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"]
203
- # my_array = ["plants","human"] # used for a shoter test
204
-
205
- conecta_uniprot(my_array, formatted_db_path)
206
- system('gunzip '+formatted_db_path+'*.gz')
207
-
208
- isoform_hash = {}
209
- isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
363
+ download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]
210
364
 
211
- download_ncrna(formatted_db_path)
212
-
213
- my_array.each do |db_group|
214
-
215
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path)
216
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path)
217
-
218
- sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta")
219
- tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta")
220
- system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids")
221
- system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids")
222
-
365
+ if !options[:no_download]
366
+ conecta_uniprot(options[:uniprot_div], formatted_db_path)
367
+ system('gunzip '+formatted_db_path+'*.gz')
223
368
  end
224
369
 
225
- puts "download_fln_dbs.rb has finished"
370
+ if !options[:no_uniprot]
371
+ isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta")) #archivo de variantes de splicing. POR QUE?
372
+ options[:uniprot_div].each do |db_group|
373
+ filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
374
+ filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl]
375
+ end
376
+ end
377
+ puts "download_fln_dbs.rb has finished"