full_lengther_next 0.5.7 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -22,13 +22,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
23
  self.extra_deps << ['scbi_blast','>=0.0.32']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
- self.extra_deps << ['scbi_fasta','>=0.1.7']
25
+ self.extra_deps << ['scbi_zcat']
26
26
  self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
27
27
  self.extra_deps << ['bio', '>= 1.4.3']
28
- # self.extra_deps << ['scbi_fastq','>=0.0.13']
29
28
  self.extra_deps << ['scbi_plot','>=0.0.6']
30
- # self.extra_deps << ['scbi_math','>=0.0.1']
31
-
29
+
32
30
  end
33
31
 
34
32
  require 'newgem/tasks'
@@ -10,7 +10,7 @@ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"
10
10
  require 'bio'
11
11
  require 'net/ftp'
12
12
  require 'open-uri'
13
- require 'scbi_fasta'
13
+ require 'scbi_zcat'
14
14
  require 'optparse'
15
15
  require 'cdhit'
16
16
  require 'handle_db'
@@ -20,52 +20,64 @@ require 'handle_db'
20
20
  #############################################################################################
21
21
  def download_ncrna(formatted_db_path, no_download)
22
22
 
23
- ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
24
- ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
23
+ ncrna_zip = File.join(formatted_db_path, 'ncrna.gz')
25
24
  db_path = File.join(formatted_db_path, 'nc_rna_db')
26
- source_file = File.join(db_path, ncrna_fasta)
27
-
28
-
25
+ db_files = File.join(db_path, 'ncrna')
26
+ fasta = File.join(db_path , 'filtered.fasta')
29
27
  if !no_download
30
28
  puts "Downloading ncRNA database"
31
29
  open(ncrna_zip, 'wb') do |my_file|
32
- my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
30
+ my_file.print open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz').read
31
+ #my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
33
32
  end
34
33
  puts "\nncRNA database downloaded"
35
- system("unzip", ncrna_zip, "-d", ncrna_fasta)
36
- system("rm", ncrna_zip)
37
34
  end
38
35
 
39
- if File.exists?(ncrna_fasta)
36
+ if File.exists?(ncrna_zip)
40
37
  Dir.mkdir(db_path) if !File.exists?(db_path)
41
- db_files = File.join(db_path, 'ncrna')
42
- filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
43
- #system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
38
+ black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] #rrn = ribosonal rna
39
+ filtered_fasta = filtering_ncbi_seqs(ncrna_zip, 40, black_list)
44
40
  puts "\nncRNA database decompressed and cleaned"
45
- do_makeblastdb(filtered_fasta, db_files, 'nucl')
46
- system("rm #{ncrna_fasta}")
41
+ #do_makeblastdb(filtered_fasta, db_files, 'nucl')
42
+ output_file = File.open(fasta, 'w')
43
+ output_file.puts filtered_fasta
44
+ output_file.close
45
+ system("cd-hit -i #{fasta} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | makeblastdb -in - -out #{db_files} -title #{File.basename(db_files)} -dbtype 'nucl' -parse_seqids")
47
46
  puts "\nncRNA database completed"
48
47
  end
49
48
  end
50
49
 
51
- def filtering_ncbi_seqs(fasta_file, max_length)
52
- fasta = FastaQualFile.new(fasta_file)
50
+ def filtering_ncbi_seqs(fasta_file, max_length, black_list)
51
+ fasta = ScbiZcatFile.new(fasta_file)
53
52
  filtered_fasta = ''
54
- fasta.each do |name, seq, comments|
55
- name ="#{name} #{comments}"
56
- if seq.length >= max_length
57
- fields = name.split('|')
58
- if fields[1] == '' || name.include?('||')
59
- new_name = name
60
- else #Cut huge description
61
- new_name = fields[0]+'|'
62
- ids = fields[1].split(',')
63
- new_name << "#{ids.first}\|#{fields[2]}"
64
- end
65
- filtered_fasta << ">#{new_name}\n#{seq}\n"
53
+ seq_name = nil
54
+ seq = ''
55
+ while !fasta.eof
56
+ line = fasta.readline.chomp
57
+ if line[0] == '>'
58
+ if !seq_name.nil?
59
+ filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
60
+ end
61
+ seq_name = line
62
+ seq = ''
63
+ else
64
+ seq << line
65
+ end
66
+ end
67
+ filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
68
+
69
+ return filtered_fasta
70
+ end
71
+
72
+ def compare_list(string, list)
73
+ res = FALSE
74
+ list.each do |word|
75
+ if string.include?(word)
76
+ res = TRUE
77
+ break
66
78
  end
67
79
  end
68
- return filtered_fasta
80
+ return res
69
81
  end
70
82
 
71
83
  def conecta_uniprot(my_array, formatted_db_path)
@@ -312,7 +324,7 @@ optparse = OptionParser.new do |opts|
312
324
  end
313
325
 
314
326
  options[:all] = FALSE
315
- opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
327
+ opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
316
328
  options[:all] = TRUE
317
329
  end
318
330
 
@@ -354,7 +366,6 @@ else # otherwise use ROOTPATH + DB
354
366
  Dir.mkdir(formatted_db_path)
355
367
  end
356
368
 
357
- puts formatted_db_path
358
369
 
359
370
  ENV['BLASTDB'] = formatted_db_path
360
371
  puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
@@ -63,9 +63,9 @@ optparse = OptionParser.new do |opts|
63
63
  options[:ident] = ident.to_f
64
64
  end
65
65
 
66
- options[:high_clustering] = TRUE
67
- opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true' ) do
68
- options[:high_clustering] = FALSE
66
+ options[:high_clustering] = FALSE
67
+ opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
68
+ options[:high_clustering] = TRUE
69
69
  end
70
70
 
71
71
  options[:subject_coverage] = 0.25
data/bin/make_user_db.rb CHANGED
@@ -63,6 +63,11 @@ optparse = OptionParser.new do |opts|
63
63
  options[:local] = TRUE
64
64
  end
65
65
 
66
+ options[:user_fasta] = nil
67
+ opts.on( '-f', '--user_fasta FILE', 'Use a cutom fasta file to build the user database') do |file|
68
+ options[:user_fasta] = file
69
+ end
70
+
66
71
  options[:cdhit] = 0
67
72
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
68
73
  options[:cdhit] = cdhit.to_f
@@ -85,10 +90,14 @@ optparse.parse!
85
90
  ########################################################
86
91
  ## MAIN
87
92
  ########################################################
88
-
89
- if options[:taxon].nil? || options[:uniprot_div].nil?
90
- puts 'Taxon or uniprot division was not specified'
91
- Process.exit(-1)
93
+ if options[:user_fasta].nil?
94
+ if options[:taxon].nil? || options[:uniprot_div].nil?
95
+ puts 'Taxon or uniprot division was not specified'
96
+ Process.exit(-1)
97
+ end
98
+ elsif !File.exists?(options[:user_fasta]) || options[:taxon].nil?
99
+ puts 'User fasta file not exists or taxon was not specified'
100
+ Process.exit(-1)
92
101
  end
93
102
 
94
103
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
@@ -102,15 +111,21 @@ if !options[:local]
102
111
  else
103
112
  user_db_folder = File.join(Dir.pwd, options[:taxon])
104
113
  end
105
- output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
114
+
106
115
  user_db_folder.gsub!(' ', '_')
107
- output_file_path.gsub!(' ', '_')
108
116
  Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
117
+ output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
118
+ output_file_path.gsub!(' ', '_')
109
119
 
110
- isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
111
- seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
112
- isoform_hash = nil
113
- seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
120
+ seqs = ''
121
+ if options[:user_fasta].nil?
122
+ isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
123
+ seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
124
+ isoform_hash = nil
125
+ seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
126
+ else
127
+ seqs = File.open(options[:user_fasta]).read
128
+ end
114
129
 
115
130
  if options[:cdhit] > 0
116
131
  output_file = File.open(output_file_path, 'w')
@@ -7,7 +7,7 @@ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', '
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.5.7'
10
+ VERSION = '0.6.0'
11
11
 
12
12
  FULL_LENGHTER_VERSION = VERSION
13
13
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: full_lengther_next
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.7
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-17 00:00:00.000000000 Z
12
+ date: 2016-04-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: xml-simple
@@ -60,13 +60,13 @@ dependencies:
60
60
  - !ruby/object:Gem::Version
61
61
  version: 0.0.29
62
62
  - !ruby/object:Gem::Dependency
63
- name: scbi_fasta
63
+ name: scbi_zcat
64
64
  requirement: !ruby/object:Gem::Requirement
65
65
  none: false
66
66
  requirements:
67
67
  - - ! '>='
68
68
  - !ruby/object:Gem::Version
69
- version: 0.1.7
69
+ version: '0'
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
@@ -74,7 +74,7 @@ dependencies:
74
74
  requirements:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
- version: 0.1.7
77
+ version: '0'
78
78
  - !ruby/object:Gem::Dependency
79
79
  name: bio-cd-hit-report
80
80
  requirement: !ruby/object:Gem::Requirement