RubyGems - full_lengther_next - Versions diffs - 0.0.8 → 0.5.6 - Mend

full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

data/.gemtest +0 -0
data/History.txt +2 -2
data/Manifest.txt +33 -18
data/Rakefile +4 -2
data/bin/download_fln_dbs.rb +310 -158
data/bin/full_lengther_next +160 -103
data/bin/make_test_dataset.rb +236 -0
data/bin/make_user_db.rb +101 -117
data/bin/plot_fln.rb +270 -0
data/bin/plot_taxonomy.rb +70 -0
data/lib/expresscanvas.zip +0 -0
data/lib/full_lengther_next.rb +3 -3
data/lib/full_lengther_next/classes/artifacts.rb +66 -0
data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
data/lib/full_lengther_next/classes/cdhit.rb +154 -0
data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
data/lib/full_lengther_next/classes/common_functions.rb +105 -63
data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
data/lib/full_lengther_next/classes/handle_db.rb +30 -0
data/lib/full_lengther_next/classes/my_worker.rb +308 -138
data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
data/lib/full_lengther_next/classes/reptrans.rb +210 -0
data/lib/full_lengther_next/classes/sequence.rb +439 -80
data/lib/full_lengther_next/classes/test_code.rb +15 -16
data/lib/full_lengther_next/classes/types.rb +12 -0
data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
data/lib/full_lengther_next/classes/warnings.rb +40 -0
metadata +207 -93
data/lib/full_lengther_next/classes/lcs.rb +0 -33
data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240

data/.gemtest ADDED

File without changes

data/History.txt CHANGED

@@ -1,6 +1,6 @@
-=== 0.0.8 2012-11-28
+=== 0.1.0 2013-09-12
-Protection against empty seqs when all seqs match against user_db
+Major rewrite of script
 === 0.0.7 2012-07-25

data/Manifest.txt CHANGED

@@ -1,29 +1,44 @@
+Rakefile
+script
+script/generate
+script/destroy
+script/console
+test
+test/test_full_lengther_next.rb
+test/test_helper.rb
+bin/plot_taxonomy.rb
+bin/plot_fln.rb
 bin/download_fln_dbs.rb
-bin/make_user_db.rb
 bin/full_lengther_next
+bin/make_user_db.rb
+bin/make_test_dataset.rb
+PostInstall.txt
+README.rdoc
 History.txt
-lib/full_lengther_next/classes/common_functions.rb
+Manifest.txt
+lib/full_lengther_next
+lib/full_lengther_next/classes
+lib/full_lengther_next/classes/blast_functions.rb
+lib/full_lengther_next/classes/my_worker_manager_fln.rb
+lib/full_lengther_next/classes/types.rb
 lib/full_lengther_next/classes/chimeric_seqs.rb
+lib/full_lengther_next/classes/artifacts.rb
+lib/full_lengther_next/classes/cdhit.rb
 lib/full_lengther_next/classes/fl_analysis.rb
 lib/full_lengther_next/classes/fl_string_utils.rb
-lib/full_lengther_next/classes/fln_stats.rb
-lib/full_lengther_next/classes/lcs.rb
 lib/full_lengther_next/classes/my_worker.rb
-lib/full_lengther_next/classes/my_worker_manager.rb
-lib/full_lengther_next/classes/nc_rna.rb
-lib/full_lengther_next/classes/orf.rb
 lib/full_lengther_next/classes/sequence.rb
+lib/full_lengther_next/classes/my_worker_EST.rb
 lib/full_lengther_next/classes/test_code.rb
+lib/full_lengther_next/classes/orf.rb
 lib/full_lengther_next/classes/une_los_hit.rb
+lib/full_lengther_next/classes/warnings.rb
+lib/full_lengther_next/classes/fln_stats.rb
+lib/full_lengther_next/classes/my_worker_manager_EST.rb
+lib/full_lengther_next/classes/nc_rna.rb
+lib/full_lengther_next/classes/reptrans.rb
+lib/full_lengther_next/classes/common_functions.rb
+lib/full_lengther_next/classes/exonerate_result.rb
+lib/full_lengther_next/classes/handle_db.rb
 lib/full_lengther_next.rb
-Manifest.txt
-PostInstall.txt
-Rakefile
-README.rdoc
-script
-script/console
-script/destroy
-script/generate
-test
-test/test_full_lengther_next.rb
-test/test_helper.rb
+lib/expresscanvas.zip

data/Rakefile CHANGED

@@ -11,7 +11,7 @@ Hoe.plugin :newgem
 # Generate all the Rake tasks
 # Run 'rake -T' to see list of generated tasks (from gem root directory)
 $hoe = Hoe.spec 'full_lengther_next' do
-  self.developer 'Noe Fernandez & Dario Guerrero', 'noeisneo@gmail.com & dariogf@gmail.com'
+  self.developer 'Pedro Seoane & Noe Fernandez & Dario Guerrero ', 'seoanezonjic@hotmail.com & noeisneo@gmail.com & dariogf@gmail.com'
   self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
   self.rubyforge_name       = self.name # TODO this is default value
   # self.extra_deps         = [['activesupport','>= 2.0.2']]
@@ -20,9 +20,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
   # self.extra_deps << ['gnuplot','>=0']
   # self.extra_deps << ['term-ansicolor','>=1.0.5']
   self.extra_deps << ['xml-simple','>=1.0.12']
-  self.extra_deps << ['scbi_blast','>=0.0.37']
+  self.extra_deps << ['scbi_blast','>=0.0.32']
   self.extra_deps << ['scbi_mapreduce','>=0.0.29']
   self.extra_deps << ['scbi_fasta','>=0.1.7']
+  self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0  ']
+  self.extra_deps << ['bio', '>= 1.4.3']
   # self.extra_deps << ['scbi_fastq','>=0.0.13']
   self.extra_deps << ['scbi_plot','>=0.0.6']
   # self.extra_deps << ['scbi_math','>=0.0.1']

data/bin/download_fln_dbs.rb CHANGED

@@ -1,66 +1,94 @@
 #!/usr/bin/env ruby
 # 15-2-2011 Noe Fernandez-Pozo
 # Script to download Full-LengtherNext databases.
 # Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
+require 'bio'
 require 'net/ftp'
 require 'open-uri'
+require 'scbi_fasta'
+require 'optparse'
+require 'cdhit'
+require 'handle_db'
+##############################################################################################
+## METHODS
+#############################################################################################
+def download_ncrna(formatted_db_path, no_download)
-################################################### Functions
+	ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
+	ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
+	db_path = File.join(formatted_db_path, 'nc_rna_db')
+	source_file = File.join(db_path, ncrna_fasta)
-def download_ncrna(formatted_db_path)
-	if !File.exists?(File.join(formatted_db_path, "nc_rna_db"))
-		Dir.mkdir(File.join(formatted_db_path, "nc_rna_db"))
+	if !no_download
+		puts "Downloading ncRNA database"
+		open(ncrna_zip, 'wb') do |my_file|
+		 	my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
+		end
+		puts "\nncRNA database downloaded"
+		system("unzip", ncrna_zip, "-d", ncrna_fasta)
+		system("rm", ncrna_zip)
 	end
-	puts "Downloading ncRNA database"
-	open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file|
-	  my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read
+	if  File.exists?(ncrna_fasta)
+		Dir.mkdir(db_path) if !File.exists?(db_path)
+		db_files = File.join(db_path, 'ncrna')
+		filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
+		#system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
+		puts "\nncRNA database decompressed and cleaned"
+		do_makeblastdb(filtered_fasta, db_files, 'nucl')
+		system("rm #{ncrna_fasta}")
+		puts "\nncRNA database completed"
 	end
-	puts "\nncRNA database downloaded"
-	ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip')
-	ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db')
-	system("unzip", ncrna_zip, "-d", ncrna_out_dir)
-	system("rm", ncrna_zip)
-	puts "\nncRNA database decompressed"
-	ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta')
-	system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids")
-	puts "\nncRNA database completed"
+end
+def filtering_ncbi_seqs(fasta_file, max_length)
+    fasta = FastaQualFile.new(fasta_file)
+    filtered_fasta = ''
+    fasta.each do |name, seq, comments|
+    	name ="#{name} #{comments}"
+        if seq.length >= max_length
+            fields = name.split('|')
+            if fields[1] == '' || name.include?('||')
+                new_name = name
+            else #Cut huge description
+                new_name = fields[0]+'|'
+                ids = fields[1].split(',')
+                new_name << "#{ids.first}\|#{fields[2]}"
+            end
+            filtered_fasta << ">#{new_name}\n#{seq}\n"
+        end
+    end
+    return filtered_fasta
 end
 def conecta_uniprot(my_array, formatted_db_path)
+	Dir.mkdir(formatted_db_path) if !File.exists?(formatted_db_path)
+	varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
 	$ftp = Net::FTP.new()
-	if !File.exists?(formatted_db_path)
-		Dir.mkdir(formatted_db_path)
-	end
-	$ftp.connect('ftp.uniprot.org')
+	$ftp.connect('ftp.ebi.ac.uk')
 	$ftp.login
 	puts "connected to UniProt"
 	my_array.each do |db_group|
 		puts "Downloading #{db_group}"
 		download_uniprot(db_group, formatted_db_path)
 	end
-	varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
+	#archivo de variantes de splicing. POR QUE?
 	$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
 	$ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
-	puts "isoform files downloaded"
 	$ftp.close
+	puts "isoform files downloaded"
 end
 def download_uniprot(uniprot_group, formatted_db_path)
@@ -70,156 +98,280 @@ def download_uniprot(uniprot_group, formatted_db_path)
 	$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
 	$ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
 	$ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
 	puts "#{uniprot_group} files downloaded"
 end
-def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path)
-	puts "filtering sequences from #{file_name}"
-	# UniProtKB fragments with FT NON_CONS and FT NON_TER features.
-	#
-	#     * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
-	#       FT NON_TER 1 1
-	#       FT NON_TER 29 29
-	#     * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
-	#       FT NON_CONS 1683 1684
-	#
-	# NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
-	newseq=false
-	print_seq=true
-	id=''
-	description = ''
-	organism_name = ''
-	seq = ''
-	organelle = ''
-	file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
-	db_name = $1
-	output_name = $2
-	db_name.sub!('sprot','sp')
-	db_name.sub!('trembl','tr')
-	if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
-		Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
-	end
-	output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
-	File.open(file_name).each_line do |line|
-		if (newseq == false)
-			if (line =~ /^AC\s+(\w+);/)
-				id=$1
-				newseq = true
-				description = ''
-				organism_name = ''
-				seq = ''
-				print_seq = true
-				organelle = ''
-			end
-		else
-			if (line =~ /^DE\s+(.+)\;*/)
-				if (description == '')
-					description = $1
-					description.sub!(/RecName: Full=/,'sp=')
-					description.sub!(/SubName: Full=/,'tr=')
-				end
-				if (line =~ /Flags: Fragment/)
-					# puts "#{id} #{line}"
-					print_seq=false
-				end
-			elsif (line =~ /^OS\s+(.+)/)
-				organism_name = $1
-			elsif (line =~ /^OG\s+(.+)/)
-				organelle = $1
-			elsif (line =~ /^FT\s+NON_TER\s+/)
-				print_seq=false
-				# puts "#{id}   NON_TER"
-			elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
-				print_seq=false
-				# puts "#{id}   NON_CONS"
-			elsif (line =~ /^\s+([\w\s]+)/)
-				seq += $1
-			elsif (line =~ /^\/\//)
-				seq.gsub!(/\s*/,'')
-				if (seq !~ /^M/i)
-					print_seq=false
-				end
-				newseq = false
-				if (print_seq)
-					output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
-					if (!isoform_hash[id].nil?)
-						output_file.puts isoform_hash[id]
-					end
-				end
+def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
+	file_name = prefix +'_' + db_group
+	puts 'Building ' + file_name
+	fasta = File.join(formatted_db_path,"#{file_name}","#{file_name}.fasta")
+	blastdb_input = fasta.gsub('.fasta', '')
+	current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
+	if File.exists?(current_db_source)
+		seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
+		if !options[:only_index]
+			if options[:cdhit] > 0
+				output_file = File.open(fasta, 'w')
+				output_file.puts seqs
+				output_file.close
+				system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids")
+			else
+				do_makeblastdb(seqs, blastdb_input, 'prot')
 			end
 		end
 	end
-	output_file.close
 end
-def load_isoform_hash(file)
+def complete?(uniprot_record)
+	complete = TRUE
+	if uniprot_record.description.include?('Flags: Fragment') || #Discard non full length records
+		uniprot_record.seq[0] != 'M' ||
+		uniprot_record.seq.include?('XX') ||
+		uniprot_record.ft.keys.include?('NON_TER') ||# The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key
+		uniprot_record.ft.keys.include?('NON_CONS') # Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them
+		complete = FALSE
+	end
+	return complete
+end
+def fln_record(uniprot_record, seqs, index, isoform_hash)
+	index_record = []
+	# Primary data
+	accession_number = uniprot_record.accession
+	description_data = uniprot_record.description.split(';')
+	description = description_data.first
+	description.sub!(/RecName: Full=/,'sp=')
+	description.sub!(/SubName: Full=/,'tr=')
+	description.sub!(/{\S*}/,'')
+	organism = uniprot_record.os.first.values.reverse.join(' ')
+	organelle = uniprot_record.og.join(' ')
+	sequence = uniprot_record.seq.gsub('U','X')
+	# Secondary data
+	index_record << accession_number
+	index_record << description
+	taxonomy = uniprot_record.oc.join(';')
+	index_record << taxonomy
+	index_record << organism
+	index_record << sequence
+	if !organelle.empty?
+		index_record << organelle
+	else
+		index_record << '-'
+	end
+	go_data = uniprot_record.dr['GO']
+	if !go_data.nil?
+		index_record << go_data.map{|go| go[0]}.join(';') # GO ID
+		index_record << go_data.map{|go| go[1]}.join(';') # GO Description
+	else
+		index_record << '-'
+		index_record << '-'
+	end
+	kegg_data = uniprot_record.dr['KEGG']
+	if !kegg_data.nil?
+		index_record << kegg_data.map{|kegg| kegg[0]}.join(';')
+	else
+		index_record << '-'
+	end
+	interpro_data = uniprot_record.dr['InterPro']
+	if !interpro_data.nil?
+		index_record << interpro_data.map{|ip| ip[0]}.join(';') # interpro ID
+		index_record << interpro_data.map{|ip| ip[1]}.join(';') # ip Description
+	else
+		index_record << '-'
+		index_record << '-'
+	end
+	if !description_data[1].nil? && description_data[1].include?('EC=')
+		index_record << description_data[1].split(' ').first.gsub('=',':')
+	else
+		index_record << '-'
+	end
+	pfam_data = uniprot_record.dr['Pfam']
+	if !pfam_data.nil?
+		index_record << pfam_data.map!{|pf| pf[0]}.join(';') # pfam ID
+		index_record << pfam_data.map!{|pf| pf[1]}.join(';') # pfam description
+	else
+		index_record << '-'
+		index_record << '-'
+	end
+	unipathway_data = uniprot_record.dr['UniPathway']
+	if !unipathway_data.nil?
+		index_record << unipathway_data.map!{|pf| pf[0]}.join(';') # unipathway ID
+	else
+		index_record << '-'
+	end
+	seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
+	index.puts index_record.join("\t")
+	seqs << isoform_hash[accession_number]+"\n" if !isoform_hash.nil? && !isoform_hash[accession_number].nil?
+end
+def	ncbi_record(uniprot_record, seqs)
+	accession_number = uniprot_record.accession
+	id = uniprot_record.entry_id
+	organism = uniprot_record.os.first.values.reverse.join(' ')
+	sequence = uniprot_record.seq
+	description = uniprot_record.description.split(';').first
+	gene_name = nil
+	gn_field = uniprot_record.gn.first
+	gene_name = gn_field[:name] if !gn_field.nil?
+	prediction_field = uniprot_record.get('PE')
+	prediction_field =~ /PE\s+(\d+):/
+	prediction_status = $1
+	sequence_version_field = uniprot_record.dt['sequence']
+	sequence_version_field =~ /sequence version (\d+)./
+	sequence_version = $1
+	db = nil
+	if description.include?('RecName: Full=')
+		db = 'sp'
+		description.sub!(/RecName: Full=/,'')
+	elsif description.include?('SubName: Full=')
+		db = 'tr'
+		description.sub!(/SubName: Full=/,'')
+	end
+	taxonomy = uniprot_record.oc.join(';')
+	seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
+end
+def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
-	isoform_hash = {}
-	my_fasta = ''
-	acc = ''
-	File.open(file).each do |line|
-		line.chomp!
-		if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/)
-			if (isoform_hash[acc].nil?)
-				isoform_hash[acc]= "#{my_fasta}\n"
+	puts "filtering sequences from #{file_name}"
+	db_folder = File.join(formatted_db_path, db_name)
+	Dir.mkdir(db_folder) if !File.exists?(db_folder)
+	main_name = File.join(db_folder, db_name)
+	index = File.open(main_name + '.index', 'w') if !options[:all]
+	seqs = ''
+	Bio::FlatFile.auto(file_name).each_entry {|uniprot_record|
+		if !options[:all] && !complete?(uniprot_record)
+			next
+		else #Get attributes of full length records
+			if options[:all]
+				ncbi_record(uniprot_record, seqs)
 			else
-				isoform_hash[acc]+= "#{my_fasta}\n"
+				fln_record(uniprot_record, seqs, index, isoform_hash)
 			end
-			my_fasta = "#{$1}\n"
-			acc = $2
-		else
-			my_fasta += line
 		end
-	end
-	return isoform_hash
+	}
+	index.close if !options[:all]
+	return seqs
 end
-################################################### MAIN
-ROOT_PATH=File.dirname(__FILE__)
+##########################################################################################
+## OPTIONS
+##########################################################################################
+options = {}
+divs = %w{human fungi invertebrates mammals plants rodents vertebrates}
+optparse = OptionParser.new do |opts|
+  options[:uniprot_div] = divs
+  opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
+		temp_divs = uniprot_div.split(',')
+		check_valid_ids = temp_divs - divs
+		if !check_valid_ids.empty?
+			puts 'This uniprot division not exists', check_valid_ids
+			process.exit
+		else
+			options[:uniprot_div] = temp_divs
+		end
+  end
+  options[:no_download] = FALSE
+  opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
+		options[:no_download] = TRUE
+  end
+  options[:no_ncrna] = FALSE
+  opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
+		options[:no_ncrna] = TRUE
+  end
+  options[:only_index] = FALSE
+  opts.on( '-i', '--only_index', 'Build annotation index only without blast DB') do
+		options[:only_index] = TRUE
+  end
+  options[:no_trembl] = FALSE
+  opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
+		options[:no_trembl] = TRUE
+  end
+  options[:all] = FALSE
+  opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
+		options[:all] = TRUE
+  end
+  options[:cdhit] = 0
+  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
+		options[:cdhit] = cdhit.to_f
+  end
+	options[:no_uniprot] = FALSE
+  opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
+		options[:no_uniprot] = TRUE
+  end
+  # Set a banner, displayed at the top of the help screen.
+  opts.banner = "Usage: #{File.basename(__FILE__)} [options]  \n\n"
+  # This displays the help screen
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end # End opts
+# parse options and remove from ARGV
+optparse.parse!
+##############################################################################################
+## MAIN
+##############################################################################################
 if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
   formatted_db_path = ENV['BLASTDB']
 else # otherwise use ROOTPATH + DB
   formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
+  Dir.mkdir(formatted_db_path)
 end
-ENV['BLASTDB']=formatted_db_path
+puts formatted_db_path
+ENV['BLASTDB'] = formatted_db_path
 puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
 puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"
-my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"]
-# my_array = ["plants","human"] # used for a shoter test
-conecta_uniprot(my_array, formatted_db_path)
-system('gunzip '+formatted_db_path+'*.gz')
-isoform_hash = {}
-isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
+download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]
-download_ncrna(formatted_db_path)
-my_array.each do |db_group|
-	filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path)
-	filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path)
-	sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta")
-	tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta")
-	system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids")
-	system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids")
+if !options[:no_download]
+	conecta_uniprot(options[:uniprot_div], formatted_db_path)
+	system('gunzip '+formatted_db_path+'*.gz')
 end
-puts "download_fln_dbs.rb has finished"
+if !options[:no_uniprot]
+	isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))  #archivo de variantes de splicing. POR QUE?
+	options[:uniprot_div].each do |db_group|
+		filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
+		filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl]
+	end
+end
+puts "download_fln_dbs.rb has finished"