full_lengther_next 0.5.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -22,13 +22,11 @@ $hoe = Hoe.spec 'full_lengther_next' do
22
22
  self.extra_deps << ['xml-simple','>=1.0.12']
23
23
  self.extra_deps << ['scbi_blast','>=0.0.32']
24
24
  self.extra_deps << ['scbi_mapreduce','>=0.0.29']
25
- self.extra_deps << ['scbi_fasta','>=0.1.7']
25
+ self.extra_deps << ['scbi_zcat']
26
26
  self.extra_deps << ['bio-cd-hit-report', '>= 0.1.0 ']
27
27
  self.extra_deps << ['bio', '>= 1.4.3']
28
- # self.extra_deps << ['scbi_fastq','>=0.0.13']
29
28
  self.extra_deps << ['scbi_plot','>=0.0.6']
30
- # self.extra_deps << ['scbi_math','>=0.0.1']
31
-
29
+
32
30
  end
33
31
 
34
32
  require 'newgem/tasks'
@@ -10,7 +10,7 @@ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"
10
10
  require 'bio'
11
11
  require 'net/ftp'
12
12
  require 'open-uri'
13
- require 'scbi_fasta'
13
+ require 'scbi_zcat'
14
14
  require 'optparse'
15
15
  require 'cdhit'
16
16
  require 'handle_db'
@@ -20,52 +20,64 @@ require 'handle_db'
20
20
  #############################################################################################
21
21
  def download_ncrna(formatted_db_path, no_download)
22
22
 
23
- ncrna_zip = File.join(formatted_db_path, 'ncrna.zip')
24
- ncrna_fasta = File.join(formatted_db_path, 'ncrna.fasta')
23
+ ncrna_zip = File.join(formatted_db_path, 'ncrna.gz')
25
24
  db_path = File.join(formatted_db_path, 'nc_rna_db')
26
- source_file = File.join(db_path, ncrna_fasta)
27
-
28
-
25
+ db_files = File.join(db_path, 'ncrna')
26
+ fasta = File.join(db_path , 'filtered.fasta')
29
27
  if !no_download
30
28
  puts "Downloading ncRNA database"
31
29
  open(ncrna_zip, 'wb') do |my_file|
32
- my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
30
+ my_file.print open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz').read
31
+ #my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read
33
32
  end
34
33
  puts "\nncRNA database downloaded"
35
- system("unzip", ncrna_zip, "-d", ncrna_fasta)
36
- system("rm", ncrna_zip)
37
34
  end
38
35
 
39
- if File.exists?(ncrna_fasta)
36
+ if File.exists?(ncrna_zip)
40
37
  Dir.mkdir(db_path) if !File.exists?(db_path)
41
- db_files = File.join(db_path, 'ncrna')
42
- filtered_fasta = filtering_ncbi_seqs(ncrna_fasta, 40)
43
- #system("LANG=C sed 's/[^A-Za-z0-9\.> -\|]/_/g' #{ncrna_fasta}_filtered > #{ncrna_fasta}_cln")
38
+ black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] #rrn = ribosonal rna
39
+ filtered_fasta = filtering_ncbi_seqs(ncrna_zip, 40, black_list)
44
40
  puts "\nncRNA database decompressed and cleaned"
45
- do_makeblastdb(filtered_fasta, db_files, 'nucl')
46
- system("rm #{ncrna_fasta}")
41
+ #do_makeblastdb(filtered_fasta, db_files, 'nucl')
42
+ output_file = File.open(fasta, 'w')
43
+ output_file.puts filtered_fasta
44
+ output_file.close
45
+ system("cd-hit -i #{fasta} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | makeblastdb -in - -out #{db_files} -title #{File.basename(db_files)} -dbtype 'nucl' -parse_seqids")
47
46
  puts "\nncRNA database completed"
48
47
  end
49
48
  end
50
49
 
51
- def filtering_ncbi_seqs(fasta_file, max_length)
52
- fasta = FastaQualFile.new(fasta_file)
50
+ def filtering_ncbi_seqs(fasta_file, max_length, black_list)
51
+ fasta = ScbiZcatFile.new(fasta_file)
53
52
  filtered_fasta = ''
54
- fasta.each do |name, seq, comments|
55
- name ="#{name} #{comments}"
56
- if seq.length >= max_length
57
- fields = name.split('|')
58
- if fields[1] == '' || name.include?('||')
59
- new_name = name
60
- else #Cut huge description
61
- new_name = fields[0]+'|'
62
- ids = fields[1].split(',')
63
- new_name << "#{ids.first}\|#{fields[2]}"
64
- end
65
- filtered_fasta << ">#{new_name}\n#{seq}\n"
53
+ seq_name = nil
54
+ seq = ''
55
+ while !fasta.eof
56
+ line = fasta.readline.chomp
57
+ if line[0] == '>'
58
+ if !seq_name.nil?
59
+ filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
60
+ end
61
+ seq_name = line
62
+ seq = ''
63
+ else
64
+ seq << line
65
+ end
66
+ end
67
+ filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list)
68
+
69
+ return filtered_fasta
70
+ end
71
+
72
+ def compare_list(string, list)
73
+ res = FALSE
74
+ list.each do |word|
75
+ if string.include?(word)
76
+ res = TRUE
77
+ break
66
78
  end
67
79
  end
68
- return filtered_fasta
80
+ return res
69
81
  end
70
82
 
71
83
  def conecta_uniprot(my_array, formatted_db_path)
@@ -312,7 +324,7 @@ optparse = OptionParser.new do |opts|
312
324
  end
313
325
 
314
326
  options[:all] = FALSE
315
- opts.on( '-a', '--all_sequences', 'Generate databaeses with all sequences') do
327
+ opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
316
328
  options[:all] = TRUE
317
329
  end
318
330
 
@@ -354,7 +366,6 @@ else # otherwise use ROOTPATH + DB
354
366
  Dir.mkdir(formatted_db_path)
355
367
  end
356
368
 
357
- puts formatted_db_path
358
369
 
359
370
  ENV['BLASTDB'] = formatted_db_path
360
371
  puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
@@ -63,9 +63,9 @@ optparse = OptionParser.new do |opts|
63
63
  options[:ident] = ident.to_f
64
64
  end
65
65
 
66
- options[:high_clustering] = TRUE
67
- opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true' ) do
68
- options[:high_clustering] = FALSE
66
+ options[:high_clustering] = FALSE
67
+ opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
68
+ options[:high_clustering] = TRUE
69
69
  end
70
70
 
71
71
  options[:subject_coverage] = 0.25
data/bin/make_user_db.rb CHANGED
@@ -63,6 +63,11 @@ optparse = OptionParser.new do |opts|
63
63
  options[:local] = TRUE
64
64
  end
65
65
 
66
+ options[:user_fasta] = nil
67
+ opts.on( '-f', '--user_fasta FILE', 'Use a cutom fasta file to build the user database') do |file|
68
+ options[:user_fasta] = file
69
+ end
70
+
66
71
  options[:cdhit] = 0
67
72
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
68
73
  options[:cdhit] = cdhit.to_f
@@ -85,10 +90,14 @@ optparse.parse!
85
90
  ########################################################
86
91
  ## MAIN
87
92
  ########################################################
88
-
89
- if options[:taxon].nil? || options[:uniprot_div].nil?
90
- puts 'Taxon or uniprot division was not specified'
91
- Process.exit(-1)
93
+ if options[:user_fasta].nil?
94
+ if options[:taxon].nil? || options[:uniprot_div].nil?
95
+ puts 'Taxon or uniprot division was not specified'
96
+ Process.exit(-1)
97
+ end
98
+ elsif !File.exists?(options[:user_fasta]) || options[:taxon].nil?
99
+ puts 'User fasta file not exists or taxon was not specified'
100
+ Process.exit(-1)
92
101
  end
93
102
 
94
103
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
@@ -102,15 +111,21 @@ if !options[:local]
102
111
  else
103
112
  user_db_folder = File.join(Dir.pwd, options[:taxon])
104
113
  end
105
- output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
114
+
106
115
  user_db_folder.gsub!(' ', '_')
107
- output_file_path.gsub!(' ', '_')
108
116
  Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
117
+ output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
118
+ output_file_path.gsub!(' ', '_')
109
119
 
110
- isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
111
- seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
112
- isoform_hash = nil
113
- seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
120
+ seqs = ''
121
+ if options[:user_fasta].nil?
122
+ isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
123
+ seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
124
+ isoform_hash = nil
125
+ seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
126
+ else
127
+ seqs = File.open(options[:user_fasta]).read
128
+ end
114
129
 
115
130
  if options[:cdhit] > 0
116
131
  output_file = File.open(output_file_path, 'w')
@@ -7,7 +7,7 @@ $: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', '
7
7
 
8
8
 
9
9
  module FullLengtherNext
10
- VERSION = '0.5.7'
10
+ VERSION = '0.6.0'
11
11
 
12
12
  FULL_LENGHTER_VERSION = VERSION
13
13
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: full_lengther_next
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.7
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-17 00:00:00.000000000 Z
12
+ date: 2016-04-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: xml-simple
@@ -60,13 +60,13 @@ dependencies:
60
60
  - !ruby/object:Gem::Version
61
61
  version: 0.0.29
62
62
  - !ruby/object:Gem::Dependency
63
- name: scbi_fasta
63
+ name: scbi_zcat
64
64
  requirement: !ruby/object:Gem::Requirement
65
65
  none: false
66
66
  requirements:
67
67
  - - ! '>='
68
68
  - !ruby/object:Gem::Version
69
- version: 0.1.7
69
+ version: '0'
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
@@ -74,7 +74,7 @@ dependencies:
74
74
  requirements:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
- version: 0.1.7
77
+ version: '0'
78
78
  - !ruby/object:Gem::Dependency
79
79
  name: bio-cd-hit-report
80
80
  requirement: !ruby/object:Gem::Requirement