seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#########################################
# Author:: Almudena Bocinos Rioboo
# Scans a sequence for the best (possibly inexact) occurrence of a restriction
# site: the site is slid along the sequence and, at every offset, the number of
# matching, undetermined ('N') and mismatching bases is counted.
#
# The class also carries a small name/value parameter store (read_file,
# get_param, set_param, exists?) kept in the @h hash.
#########################################
class ScanForRestrSite

  # Hit accepted by the constructor as [offset, matches, ns, mismatches],
  # or [] when no window reaches the required number of matches.
  attr_reader :best_hit

  # Stores the sequence and the restriction site, runs the scan and prints a
  # trace of every window plus the best hit to standard output.
  #
  # sequence:: String to be scanned
  # rest::     String with the restriction site
  def initialize(sequence, rest)
    @seq_fasta = sequence
    @rest = rest
    # Parameter store; without it every param method would fail on nil.
    @h = {}

    puts "#{@seq_fasta} , #{@rest}"
    res = execute

    res.each do |e|
      puts "#{e.join(',')}"
    end

    # selects from res, the max good hit
    puts "--- MAX: --- "

    # res is empty when the sequence is shorter than the site; fall back to an
    # empty hit instead of calling join on nil.
    max = res.max { |e1, e2| e1[1] <=> e2[1] } || []

    puts max.join(' ; ')

    # checks if the max one has the size of restriction with a margin of error
    margen = (@rest.size <= 4) ? 0 : 1 # <- don't change
    if (max[1] != @rest.size) && (max[1] != @rest.size - margen)
      puts "-the max good hit hasn't the size minimum: #{@rest.size} or #{@rest.size-margen} "
      max = []
    end

    @best_hit = max
  end

  # Slides the restriction site over the sequence (port of the original Perl
  # loop) and prints a trace of each comparison.
  #
  # Returns an Array with one [offset, matches, ns, mismatches] entry per
  # window; it is empty when the site is longer than the sequence.
  def execute
    r = []

    (0..@seq_fasta.size - @rest.size).each do |p|
      os = 0 # bases equal to the site
      ns = 0 # undetermined bases ('N') in the sequence
      xs = 0 # mismatches
      puts "-------[#{p}]-#{@seq_fasta[p,@seq_fasta.size-p]} , #{@rest}"

      @rest.each_char.with_index do |cc, i|
        c = @seq_fasta[i + p, 1]
        puts "(#{c}==#{cc})=>#{c==cc}"
        if c == cc
          os += 1
        elsif c == 'N'
          ns += 1
        else
          xs += 1
        end
      end

      r << [p, os, ns, xs]
      puts r[p].join(',')
    end

    r
  end

  # Reads a "name = value" parameter file into the store, skipping empty lines
  # and lines starting with '#'. The file is closed when reading finishes.
  def read_file(path_fichero)
    File.foreach(path_fichero) do |line|
      line.chomp! # delete end of line

      next if line.empty?
      next if line =~ /^#/ # comment line

      # params[0] holds the parameter's name and params[1] its value
      params = line.split(/\s*=\s*/)
      @h[params[0]] = params[1]

      $LOG.debug "read: #{params[1]}"
    end

    $LOG.info "File Params have been readed"
  end

  # Logs the name/value pair of every stored parameter
  def print_parameters
    @h.each do |clave, valor|
      $LOG.debug "The Parameter #{clave} have the value " + valor.to_s
    end
  end

  # Returns the value stored for param, or nil when it is not defined
  def get_param(param)
    @h[param]
  end

  # Stores value under the name param
  def set_param(param, value)
    @h[param] = value
  end

  # Returns true if the parameter has a non-nil value and false otherwise
  def exists?(param_name)
    !@h[param_name].nil?
  end

end
|
136
|
+
# Demo run: executed only when this file is launched directly as a script, so
# that requiring the class from other code does not print the scan trace.
if __FILE__ == $0
  scan = ScanForRestrSite.new("AaaaACGTACGT", "AGTAC")
  # scan = ScanForRestrSite.new("AaaaACGTAeCGT", "AGTAC")
end
|
138
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
class Array

  # Fallback Array#sum for Rubies that do not provide one (before 2.4).
  # It is defined only when missing, so the built-in implementation -- which
  # also accepts an initial value and a block -- is never overridden.
  unless method_defined?(:sum)
    # Returns the total of the elements, starting from 0.
    def sum
      r = 0
      each { |e| r += e }
      r
    end
  end

end
|
16
|
+
|
17
|
+
# Windowed statistics over a list of numeric values.
class ScbiStats

  # values:: Array of numbers, indexed by position
  def initialize(values)
    @values = values
  end

  # Returns the sum of the values found from i-window_size to i+window_size
  # (both included), with the window cut at the limits of the array.
  def get_window_value(i, window_size = 10)
    start_pos = [0, i - window_size].max
    # The upper limit may point one past the last element; the slice ignores it.
    end_pos = [@values.length, i + window_size].min

    @values[start_pos..end_pos].sum
  end

  # Returns the index whose surrounding window has the highest sum ("fat
  # mode"). On ties the lowest index wins; 0 is returned for an empty list.
  #
  # window_size:: number of positions taken at each side of every index
  def fat_mode(window_size = 10)
    # window_size is forwarded; previously every window used the default of 10.
    fat_modes = Array.new(@values.length) { |i| get_window_value(i, window_size) }

    fat_modes.index(fat_modes.max) || 0
  end

end
|
55
|
+
|
56
|
+
|
57
|
+
# istat=JSON.parse(File.read('initial_stats.json'))
|
58
|
+
#
|
59
|
+
# x=[]
|
60
|
+
# istat['qv'].each do |qv|
|
61
|
+
# x<< qv['tot'].to_i
|
62
|
+
#
|
63
|
+
# end
|
64
|
+
# # Usage:
|
65
|
+
#
|
66
|
+
# s=ScbiStats.new(x)
|
67
|
+
#
|
68
|
+
# puts s.fat_mode
|
@@ -0,0 +1,317 @@
|
|
1
|
+
######################################
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
3
|
+
# This is the main class.
|
4
|
+
######################################
|
5
|
+
|
6
|
+
require 'extract_stats'
|
7
|
+
|
8
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
|
9
|
+
|
10
|
+
require 'scbi_mapreduce'
|
11
|
+
require 'seqtrim_work_manager'
|
12
|
+
require 'action_manager'
|
13
|
+
|
14
|
+
# SEQTRIM_VERSION_REVISION=27
|
15
|
+
# SEQTRIM_VERSION_STAGE = 'b'
|
16
|
+
# $SEQTRIM_VERSION = "2.0.0#{SEQTRIM_VERSION_STAGE}#{SEQTRIM_VERSION_REVISION}"
|
17
|
+
|
18
|
+
class Seqtrim

  # Overall flow of a run (server side):
  #
  # First of all, reads the parameter's file, where the values of all
  # parameters and the 'plugin_list' (execution order of the plugins) are.
  #
  # Secondly, loads the plugins.
  #
  # Thirdly, checks that the parameter's file defines what every plugin that is
  # going to be executed needs.
  #
  # After that, the sequences are read from the input files and handed to the
  # workers, which execute the plugins over them.

  # Builds the shell command that runs cd-hit-454 over cd_hit_input_file.
  #
  # cd_hit_input_file:: path of the sequences file given to cd-hit-454
  # workers::           either a number of workers or a list of worker hosts
  # init_file_path::    environment file sourced before a remote execution
  #
  # With a numeric +workers+ the command runs locally with that many threads.
  # With a list, the most repeated host decides the number of threads, and the
  # command is wrapped in ssh when that host is not the first one of the list.
  #
  # NOTE(review): paths and host names are interpolated unescaped into a shell
  # command line; values with spaces or shell metacharacters would break it.
  def get_cd_hit_cmd(cd_hit_input_file, workers, init_file_path)
    num_cpus_cdhit =
      begin
        # Use the converted value: a numeric String must not reach the
        # "-M #{n*1000}" arithmetic as a String.
        Integer(workers)
      rescue TypeError, ArgumentError
        nil # not an integer: workers is a list of hosts
      end

    return cd_hit_local_cmd(cd_hit_input_file, num_cpus_cdhit) if num_cpus_cdhit

    worker_hash = Hash.new(0)
    workers.each { |e| worker_hash[e] += 1 }

    ranked_workers = worker_hash.sort_by { |_k, v| -v }
    max_worker = ranked_workers.first
    puts "Found these workers: #{ranked_workers}"

    cmd = cd_hit_local_cmd(cd_hit_input_file, max_worker[1])

    # worker is different to current machine, send over ssh
    if max_worker[0] != workers[0]
      init = File.exist?(init_file_path) ? ". #{init_file_path}; " : ''

      pwd = `pwd`.chomp
      cd = File.exist?(pwd) ? "cd #{pwd}; " : ''

      cmd = "ssh #{max_worker[0]} \"#{init} #{cd} #{cmd}\""
    end

    cmd
  end

  # Checks the global (not plugin specific) parameters through
  # params.check_param, which receives the errors list, the parameter name,
  # its type, a default value and a description.
  #
  # Returns true when no error was collected; otherwise logs every error and
  # returns false.
  def check_global_params(params)
    errors = []

    # name, type, default value, comment -- checked in this order
    global_params = [
      ['plugin_list', 'PluginList', nil,
       'Plugins applied to every sequence, separated by commas. Order is important'],
      ['next_generation_sequences', 'String', 'true',
       'Should SeqTrimNext analysis be based on NGS? (if setting to false, a classic Sanger sequencing is considered)'],
      ['remove_clonality', 'String', 'true',
       'Remove duplicated (clonal) sequences (using CD-HIT 454)'],
      ['min_insert_size_trimmed', 'Integer', 40,
       'Minimum insert size for every trimmed sequence'],
      ['min_insert_size_paired', 'Integer', 40,
       'Minimum insert size for each end of paired-end reads; true paired-ends have both single-ends longer than this value'],
      ['accept_very_long_sequences', 'String', 'true',
       'Do not reject unexpectedly long sequences found in the raw data'],
      ['seqtrim_version', 'String', Seqtrimnext::SEQTRIM_VERSION,
       'Seqtrim version']
    ]

    global_params.each do |name, type, default_value, comment|
      params.check_param(errors, name, type, default_value, comment)
    end

    unless errors.empty?
      $LOG.error 'Please, define the following global parameters in params file:'
      errors.each do |error|
        $LOG.error ' -' + error
      end
    end

    errors.empty?
  end

  # Runs SeqTrimNext.
  #
  # options is a hash read with the keys :template, :server_ip, :port,
  # :workers, :only_workers, :chunk_size, :json, :fastq, :fasta and :qual.
  #
  # With :only_workers set, only the workers are launched. Otherwise the input
  # is opened, params and plugins are checked, stats are extracted, cd-hit-454
  # is optionally executed and the work manager server is started.
  def initialize(options)
    params_path = options[:template]

    ip = options[:server_ip]
    port = options[:port]
    workers = options[:workers]
    only_workers = options[:only_workers]
    chunk_size = options[:chunk_size]
    use_json = options[:json]

    # it is the server part
    unless only_workers
      # TODO - FIX seqtrim to not iterate two times over input, so STDIN can be used
      sequence_reader, cd_hit_input_file = open_sequence_reader(options)

      $LOG.info "Loading params"
      # Reads the parameter's file
      params = Params.new(params_path)

      $LOG.info "Checking global params"
      exit unless check_global_params(params)

      # Load actions
      $LOG.info "Loading actions"
      ActionManager.new

      # load plugins
      plugin_list = params.get_param('plugin_list')
      $LOG.info "Loading plugins [#{plugin_list}]"
      plugin_manager = PluginManager.new(plugin_list, params)

      # load plugin params
      $LOG.info "Check plugin params"
      unless plugin_manager.check_plugins_params(params)
        $LOG.error "Plugin check failed"

        # save used params to file
        params.save_file('used_params.txt')

        exit
      end

      Dir.mkdir(OUTPUT_PATH) unless Dir.exist?(OUTPUT_PATH)

      $LOG.info "Calculatings stats"
      # Extract global stats
      ExtractStats.new(sequence_reader, params)

      # save used params to file
      params.save_file(File.join(OUTPUT_PATH, 'used_params.txt'))

      # read mids and linkers
      params.load_mids(File.join($FORMATTED_DB_PATH, 'mids.fasta'))
      params.load_linkers(File.join($FORMATTED_DB_PATH, 'linkers.fasta'))

      # execute cd-hit
      if params.get_param('remove_clonality') == 'true'
        cmd = get_cd_hit_cmd(cd_hit_input_file, workers, File.join($SEQTRIM_PATH, 'init_env'))

        $LOG.info "Executing cd-hit-454: #{cmd}"

        # an existing clusters file is reused instead of running cd-hit again
        system(cmd) unless File.exist?('clusters.fasta.clstr')

        if File.exist?('clusters.fasta.clstr')
          params.load_repeated_seqs('clusters.fasta.clstr')
        else
          $LOG.error("Exiting due to not found clusters.fasta.clstr. Maybe cd-hit failed. Check cd-hit.out")
          exit
        end
      end
    end # end only_workers

    custom_worker_file = File.join(File.dirname(__FILE__), 'em_classes', 'seqtrim_worker.rb')

    $LOG.info "Workers:\n#{workers}"

    if only_workers
      worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip, port, workers, custom_worker_file, STDOUT)
      worker_launcher.launch_workers_and_wait
    else
      $LOG.info 'Starting server'

      SeqtrimWorkManager.init_work_manager(sequence_reader, params, chunk_size, use_json)

      # one worker is given up (because of the server) when there is room
      workers = reserve_server_slot(workers, detect_cpu_count)

      # launch processor server passing the ip, port and all required params
      server = ScbiMapreduce::Manager.new(ip, port, workers, SeqtrimWorkManager, custom_worker_file, STDOUT, '~/.seqtrimnext')
      server.chunk_size = chunk_size
      server.start_server

      # close sequence reader
      sequence_reader.close
      $LOG.info 'Closing server'
    end
  end

  private

  # cd-hit-454 command line using num_cpus threads and num_cpus*1000 as -M.
  def cd_hit_local_cmd(cd_hit_input_file, num_cpus)
    "cd-hit-454 -i #{cd_hit_input_file} -o clusters.fasta -M #{num_cpus * 1000} -T #{num_cpus} > cd-hit-454.out"
  end

  # Opens the input given in options (:fastq, where '-' means STDIN, or
  # :fasta with an optional :qual) and returns [sequence_reader, input_path];
  # input_path is what cd-hit-454 later receives as input.
  def open_sequence_reader(options)
    if options[:fastq]
      seqs_path = options[:fastq] == '-' ? STDIN : File.expand_path(options[:fastq])
      [FastqFile.new(seqs_path, 'r', :sanger, true), seqs_path]
    else
      seqs_path = File.expand_path(options[:fasta])
      # The qual file is optional; expand it only when it was given.
      qual_path = File.expand_path(options[:qual]) if options[:qual]
      [FastaQualFile.new(seqs_path, qual_path, true), seqs_path]
    end
  end

  # Number of processors reported by the system tools, or 1 when they cannot
  # be executed.
  def detect_cpu_count
    if RUBY_PLATFORM.downcase.include?("darwin")
      `hwprefs -cpu_count`.chomp.to_i
    else
      `grep processor /proc/cpuinfo |wc -l`.chomp.to_i
    end
  rescue
    1
  end

  # Returns workers with one slot removed when there is more than one worker
  # and fewer than cpus: a list loses its first entry, a number is decremented.
  def reserve_server_slot(workers, cpus)
    if workers.respond_to?(:shift)
      workers.shift if workers.count > 1 && workers.count < cpus
    elsif workers > 1 && workers < cpus
      workers -= 1
    end

    workers
  end

end # Seqtrim class
|
@@ -0,0 +1,55 @@
|
|
1
|
+
########################################################
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
3
|
+
#
|
4
|
+
# Defines the class Sequence's attribute
|
5
|
+
#
|
6
|
+
########################################################
|
7
|
+
|
8
|
+
# Holds one sequence: its name, bases, qualities and comment, plus the flags
# set while it is processed (rejected, repeated, reversed).
class Sequence

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :seq_comment, :seq_rejected, :seq_repeated, :seq_reversed
  attr_accessor :seq_rejected_by_message

  # Stores the name and the contents of the sequence.
  #
  # seq_name::    identifier of the sequence
  # seq_fasta::   String with the bases
  # seq_qual::    qualities of the sequence
  # seq_comment:: optional comment, empty by default
  def initialize(seq_name, seq_fasta, seq_qual, seq_comment = '')
    @seq_fasta = seq_fasta
    @seq_name = seq_name
    @seq_qual = seq_qual
    @seq_comment = seq_comment

    @seq_rejected = false
    @seq_repeated = false
    @seq_reversed = false

    @seq_rejected_by_message = ''

    # Snapshot taken at creation time; the predicates below always look at the
    # current bases.
    @ns_present = ns_present?
    @xs_present = xs_present?
  end

  # True when the bases contain at least one 'N'
  def ns_present?
    @seq_fasta.include?('N')
  end

  # True when the bases contain at least one 'X'
  def xs_present?
    @seq_fasta.include?('X')
  end

  # True when the sequence has at least seq_min_length bases
  def seq_is_long_enough(seq_min_length)
    @seq_fasta.length >= seq_min_length
  end

  # Returns the sequence as a fasta record: ">name" and the bases
  def to_fasta
    ">#{@seq_name}\n#{@seq_fasta}"
  end

  # Returns the qualities as a qual record: ">name" and the values.
  # A list of qualities is written separated by spaces (interpolating an Array
  # would give "[40, 30]"); any other value is written as its string form.
  def to_qual
    qual = @seq_qual.respond_to?(:join) ? @seq_qual.join(' ') : @seq_qual.to_s
    ">#{@seq_name}\n#{qual}"
  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# A list of sequences handled as one unit, together with the stats and the
# output collected for it.
class SequenceGroup

  attr_accessor :stats, :output_text, :output_files

  # seqs:: initial Array of sequences of the group
  def initialize(seqs)
    @stats = {}
    @seqs = seqs
    @output_text = {}
    @output_files = {}
  end

  # Appends seq to the group
  def push(seq)
    @seqs.push seq
  end

  # Removes seq from the group; returns it, or nil when it was not there
  def delete(seq)
    @seqs.delete(seq)
  end

  def empty?
    @seqs.empty?
  end

  # Yields every sequence. Without a block an Enumerator is returned instead
  # of raising LocalJumpError.
  def each(&block)
    @seqs.each(&block)
  end

  # Yields every sequence with its position; Enumerator without a block
  def each_with_index(&block)
    @seqs.each_with_index(&block)
  end

  # Yields every sequence from last to first; Enumerator without a block
  def reverse_each(&block)
    @seqs.reverse_each(&block)
  end

  # Adds all the sequences of array. A new list is built, so the array that
  # was holding the sequences until now is left untouched.
  def add(array)
    @seqs += array
  end

  def count
    @seqs.count
  end

  def include?(s)
    @seqs.include?(s)
  end

  # Empties the group by starting a new list
  def remove_all_seqs
    @seqs = []
  end

  # def job_identifier
  #   return @seqs[0].seq_name
  # end

  def inspect
    "Group with #{@seqs.count} sequences"
  end

end
|