seqtrimnext 2.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
#########################################
|
|
4
|
+
# Author:: Almudena Bocinos Rioboo
|
|
5
|
+
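# This class scans a sequence for a restriction site: the site is slid along the sequence and every ungapped alignment is scored by its number of matches, N's and mismatches. It also carries helper methods that read a parameter file into a hash of parameter names and values.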
|
|
6
|
+
#########################################
|
|
7
|
+
# Scans a sequence for a restriction site by sliding the site along the
# sequence and scoring every ungapped alignment.
#
# Each window is scored as [position, matches, ns, mismatches], where "ns" counts
# sequence characters equal to 'N' that did not match the site. Comparison is
# case-sensitive. Progress is printed to standard output while scanning.
#
# The class also keeps a name/value hash (@h) filled by read_file / set_param.
class ScanForRestrSite

  # Best window found by the constructor as [position, matches, ns, mismatches],
  # or [] when there was no window at all or the best one was not good enough.
  attr_reader :best_hit

  # Runs the scan immediately and prints every window plus the best one.
  #
  # sequence:: String to search in.
  # rest::     String with the restriction site to look for.
  def initialize(sequence, rest)
    @seq_fasta = sequence
    @rest = rest
    @h = {}          # parameter store used by the *_param helpers
    @best_hit = []
    puts "#{@seq_fasta} , #{@rest}"
    res = execute

    res.each do |e|
      puts "#{e.join(',')}"
    end

    # selects from res the window with the most matches
    puts "--- MAX: --- "
    max = res.max { |e1, e2| e1[1] <=> e2[1] }

    # no window exists when the sequence is shorter than the site
    return if max.nil?

    puts max.join(' ; ')

    # the best window must match the whole site; sites longer than 4 bases
    # are allowed one non-matching position
    margen = (@rest.size <= 4) ? 0 : 1 # <- don't change
    if (max[1] != @rest.size) && (max[1] != @rest.size - margen)
      puts "-the max good hit hasn't the size minimum: #{@rest.size} or #{@rest.size-margen} "
      max = []
    end

    @best_hit = max
  end

  # Scores the site against every window of the sequence.
  #
  # Returns an Array with one [position, matches, ns, mismatches] entry per
  # window start, or [] when the sequence is shorter than the site.
  def execute
    last_start = @seq_fasta.size - @rest.size

    (0..last_start).map do |pos|
      os = 0
      ns = 0
      xs = 0
      puts "-------[#{pos}]-#{@seq_fasta[pos,@seq_fasta.size-pos]} , #{@rest}"

      @rest.each_char.with_index do |cc, i|
        c = @seq_fasta[i + pos].chr
        puts "(#{c}==#{cc})=>#{c==cc}"
        if c == cc
          os += 1
        elsif c == 'N'
          ns += 1
        else
          xs += 1
        end
      end

      window = [pos, os, ns, xs]
      puts window.join(',')
      window
    end
  end

  # Reads a parameter file made of "name = value" lines into the parameter
  # store. Empty lines and lines starting with '#' are skipped. Values are
  # stored as the strings found in the file.
  #
  # path_fichero:: path of the file to read.
  def read_file(path_fichero)
    # File.foreach closes the file when the iteration ends
    File.foreach(path_fichero) do |line|
      line.chomp! # delete end of line

      next if line.empty?
      next if line =~ /^#/ # comment line

      # params[0] is the parameter's name and params[1] its value
      params = line.split(/\s*=\s*/)
      @h[params[0]] = params[1]

      $LOG.debug "read: #{params[1]}"
    end
    $LOG.info "File Params have been readed"
  end

  # Logs the name/value pair of every stored parameter
  def print_parameters
    @h.each do |clave, valor|
      $LOG.debug "The Parameter #{clave} have the value #{valor}"
    end
  end

  # Returns the value stored for +param+, or nil when it is not defined
  def get_param(param)
    @h[param]
  end

  # Stores +value+ under the name +param+
  def set_param(param, value)
    @h[param] = value
  end

  # Returns true if the parameter has a non-nil value and false otherwise
  def exists?(param_name)
    !@h[param_name].nil?
  end

end
|
|
136
|
+
# Demo run. It is guarded so that it only executes when this file is run
# directly as a script, and not every time the file is loaded with require.
if __FILE__ == $0
  ScanForRestrSite.new("AaaaACGTACGT", "AGTAC")
  # ScanForRestrSite.new("AaaaACGTAeCGT", "AGTAC")
end
|
|
138
|
+
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
class Array

  # Fallback for Ruby versions whose Array has no #sum (before 2.4).
  # When the interpreter already provides Array#sum it is left untouched, so
  # the built-in forms that take an initial value or a block keep working.
  unless method_defined?(:sum)
    # Returns the result of adding every element to 0
    def sum
      inject(0) { |acc, e| acc + e }
    end
  end

end
|
|
16
|
+
|
|
17
|
+
# Windowed statistics over a list of numeric values.
class ScbiStats

  # values:: Array of numbers, indexed by position.
  def initialize(values)
    @values = values
  end

  # Returns the sum of the values located at most +window_size+ positions
  # away from index +i+ (the window is cut at both ends of the list).
  def get_window_value(i, window_size = 10)
    start_pos = [0, i - window_size].max
    # an end position one past the last index is harmless: the slice stops
    # at the end of the array
    end_pos = [@values.length, i + window_size].min

    @values[start_pos..end_pos].inject(0) { |acc, v| acc + v }
  end

  # Returns the index whose surrounding window (see get_window_value) has the
  # largest sum. On ties the lowest index wins; an empty list gives 0.
  #
  # window_size:: number of positions taken on each side of every index.
  def fat_mode(window_size = 10)
    max_fat = 0
    best_fat = nil

    @values.length.times do |i|
      fat = get_window_value(i, window_size)

      # only a strictly larger window replaces the current best one
      if best_fat.nil? || best_fat < fat
        best_fat = fat
        max_fat = i
      end
    end

    max_fat
  end

end
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# istat=JSON.parse(File.read('initial_stats.json'))
|
|
58
|
+
#
|
|
59
|
+
# x=[]
|
|
60
|
+
# istat['qv'].each do |qv|
|
|
61
|
+
# x<< qv['tot'].to_i
|
|
62
|
+
#
|
|
63
|
+
# end
|
|
64
|
+
# # Usage:
|
|
65
|
+
#
|
|
66
|
+
# s=ScbiStats.new(x)
|
|
67
|
+
#
|
|
68
|
+
# puts s.fat_mode
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
######################################
|
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
|
3
|
+
# This is the main class.
|
|
4
|
+
######################################
|
|
5
|
+
|
|
6
|
+
require 'extract_stats'
|
|
7
|
+
|
|
8
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
|
|
9
|
+
|
|
10
|
+
require 'scbi_mapreduce'
|
|
11
|
+
require 'seqtrim_work_manager'
|
|
12
|
+
require 'action_manager'
|
|
13
|
+
|
|
14
|
+
# SEQTRIM_VERSION_REVISION=27
|
|
15
|
+
# SEQTRIM_VERSION_STAGE = 'b'
|
|
16
|
+
# $SEQTRIM_VERSION = "2.0.0#{SEQTRIM_VERSION_STAGE}#{SEQTRIM_VERSION_REVISION}"
|
|
17
|
+
|
|
18
|
+
# Main class of SeqTrimNext.
#
# When acting as server, the constructor:
# * opens the input sequences (FASTQ, or FASTA with an optional QUAL file),
# * reads the parameters file, which holds the value of every parameter and the
#   'plugin_list' that gives the order in which plugins are executed,
# * loads actions and plugins and checks that the parameters file defines what
#   every plugin needs,
# * extracts global stats, optionally runs cd-hit-454 to find repeated
#   sequences, and
# * starts the ScbiMapreduce manager that distributes the sequences to workers.
#
# When started with options[:only_workers] it only launches workers that
# connect to an already running server.
class Seqtrim

  # Builds the shell command that runs cd-hit-454 over +cd_hit_input_file+.
  #
  # cd_hit_input_file:: path of the sequences file given to cd-hit-454.
  # workers::           an Integer (number of local workers) or a list with one
  #                     host name per worker.
  # init_file_path::    shell file that is sourced on the remote host before
  #                     running cd-hit-454, when the file exists.
  #
  # With an Integer the command runs locally with that many CPUs. With a list,
  # the host that appears most often is chosen and its number of appearances is
  # used as CPU count; if that host is not the first of the list the command is
  # wrapped in an ssh call to it.
  #
  # Returns the command as a String.
  def get_cd_hit_cmd(cd_hit_input_file, workers, init_file_path)
    return cd_hit_command(cd_hit_input_file, workers) if workers.is_a?(Integer)

    # not an integer: count how many workers every host has
    worker_hash = {}
    workers.each { |host| worker_hash[host] = (worker_hash[host] || 0) + 1 }

    ranked = worker_hash.sort_by { |_host, slots| -slots }
    max_worker = ranked.first
    puts "Found these workers: #{ranked}"

    cmd = cd_hit_command(cd_hit_input_file, max_worker[1])

    # the first worker of the list is taken as the current machine
    return cmd if max_worker[0] == workers[0]

    # worker is different to current machine, send over ssh
    init = File.exist?(init_file_path) ? ". #{init_file_path}; " : ''

    pwd = `pwd`.chomp
    cd = File.exist?(pwd) ? "cd #{pwd}; " : ''

    "ssh #{max_worker[0]} \"#{init} #{cd} #{cmd}\""
  end

  # Checks the global parameters (the ones that do not belong to a plugin)
  # through params.check_param, which receives the default value and the
  # comment of each one. Every error collected is logged.
  #
  # Returns true when no error was found.
  def check_global_params(params)
    errors = []

    # name, type, default value, comment
    global_params = [
      ['plugin_list', 'PluginList', nil,
       'Plugins applied to every sequence, separated by commas. Order is important'],
      ['next_generation_sequences', 'String', 'true',
       'Should SeqTrimNext analysis be based on NGS? (if setting to false, a classic Sanger sequencing is considered)'],
      ['remove_clonality', 'String', 'true',
       'Remove duplicated (clonal) sequences (using CD-HIT 454)'],
      ['min_insert_size_trimmed', 'Integer', 40,
       'Minimum insert size for every trimmed sequence'],
      ['min_insert_size_paired', 'Integer', 40,
       'Minimum insert size for each end of paired-end reads; true paired-ends have both single-ends longer than this value'],
      ['accept_very_long_sequences', 'String', 'true',
       'Do not reject unexpectedly long sequences found in the raw data'],
      ['seqtrim_version', 'String', Seqtrimnext::SEQTRIM_VERSION,
       'Seqtrim version']
    ]

    global_params.each do |name, type, default_value, comment|
      params.check_param(errors, name, type, default_value, comment)
    end

    unless errors.empty?
      $LOG.error 'Please, define the following global parameters in params file:'
      errors.each do |error|
        $LOG.error ' -' + error
      end
    end

    errors.empty?
  end

  # Runs SeqTrimNext as server, or only launches workers.
  #
  # options is a Hash with the keys :template (params file), :server_ip, :port,
  # :workers (Integer or list of hosts), :only_workers, :chunk_size, :json and
  # the input files: :fastq ('-' selects STDIN) or :fasta with an optional :qual.
  def initialize(options)
    params_path = options[:template]

    ip = options[:server_ip]
    port = options[:port]
    workers = options[:workers]
    only_workers = options[:only_workers]
    chunk_size = options[:chunk_size]
    use_json = options[:json]

    sequence_reader = nil
    params = nil

    # it is the server part
    unless only_workers

      # TODO - FIX seqtrim to not iterate two times over input, so STDIN can be used

      # open sequence reader and expand input files paths
      if options[:fastq]
        seqs_path = options[:fastq] == '-' ? STDIN : File.expand_path(options[:fastq])
        sequence_reader = FastqFile.new(seqs_path, 'r', :sanger, true)
      else
        seqs_path = File.expand_path(options[:fasta])
        # the qual file is optional
        qual_path = File.expand_path(options[:qual]) if options[:qual]
        sequence_reader = FastaQualFile.new(seqs_path, qual_path, true)
      end

      cd_hit_input_file = seqs_path

      $LOG.info "Loading params"
      # Reads the parameter's file
      params = Params.new(params_path)

      $LOG.info "Checking global params"
      exit unless check_global_params(params)

      # Load actions
      $LOG.info "Loading actions"
      ActionManager.new

      # load plugins
      plugin_list = params.get_param('plugin_list') # the plugins listed in the params file
      $LOG.info "Loading plugins [#{plugin_list}]"

      plugin_manager = PluginManager.new(plugin_list, params)

      # load plugin params
      $LOG.info "Check plugin params"
      unless plugin_manager.check_plugins_params(params)
        $LOG.error "Plugin check failed"

        # save used params to file
        params.save_file('used_params.txt')

        exit
      end

      Dir.mkdir(OUTPUT_PATH) unless Dir.exist?(OUTPUT_PATH)

      $LOG.info "Calculatings stats"
      # Extract global stats
      ExtractStats.new(sequence_reader, params)

      # save used params to file
      params.save_file(File.join(OUTPUT_PATH, 'used_params.txt'))

      # only needed by the disabled format_db call below
      piro_on = (params.get_param('next_generation_sequences') == 'true')

      # format blast database with truncated file
      #MakeBlastDb.format_db(es.truncated_file_path,File.basename(es.truncated_file_path,File.extname(es.truncated_file_path)),'./') if piro_on

      # read mids and linkers
      params.load_mids(File.join($FORMATTED_DB_PATH, 'mids.fasta'))
      params.load_linkers(File.join($FORMATTED_DB_PATH, 'linkers.fasta'))

      # execute cd-hit
      if params.get_param('remove_clonality') == 'true'

        cmd = get_cd_hit_cmd(cd_hit_input_file, workers, File.join($SEQTRIM_PATH, 'init_env'))

        $LOG.info "Executing cd-hit-454: #{cmd}"

        # an existing clusters file is reused instead of running cd-hit again
        system(cmd) unless File.exist?('clusters.fasta.clstr')

        if File.exist?('clusters.fasta.clstr')
          params.load_repeated_seqs('clusters.fasta.clstr')
        else
          $LOG.error("Exiting due to not found clusters.fasta.clstr. Maybe cd-hit failed. Check cd-hit-454.out")
          exit
        end
      end

    end # end only_workers

    ############ SCBI DRB ###########

    custom_worker_file = File.join(File.dirname(__FILE__), 'em_classes', 'seqtrim_worker.rb')

    $LOG.info "Workers:\n#{workers}"

    if only_workers

      worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip, port, workers, custom_worker_file, STDOUT)
      worker_launcher.launch_workers_and_wait

    else
      $LOG.info 'Starting server'

      SeqtrimWorkManager.init_work_manager(sequence_reader, params, chunk_size, use_json)

      cpus = detect_cpu_count

      # reduce the workers by one (because of the server) when there are
      # several of them but fewer than CPUs
      if workers.is_a?(Integer)
        workers -= 1 if workers > 1 && workers < cpus
      elsif workers.count > 1 && workers.count < cpus
        workers.shift
      end

      # launch processor server passing the ip, port and all required params
      server = ScbiMapreduce::Manager.new(ip, port, workers, SeqtrimWorkManager, custom_worker_file, STDOUT, '~/.seqtrimnext')
      server.chunk_size = chunk_size
      server.start_server

      # close sequence reader
      sequence_reader.close
      $LOG.info 'Closing server'
    end

    ############ SCBI DRB ###########
  end

  private

  # Returns the cd-hit-454 command line for +input_file+ using +cpus+ threads
  # and cpus*1000 as memory limit. Output goes to cd-hit-454.out.
  def cd_hit_command(input_file, cpus)
    "cd-hit-454 -i #{input_file} -o clusters.fasta -M #{cpus*1000} -T #{cpus} > cd-hit-454.out"
  end

  # Returns the number of CPUs reported by the system, or 1 when it cannot be
  # found out.
  def detect_cpu_count
    cmd = if RUBY_PLATFORM.downcase.include?("darwin")
            'sysctl -n hw.ncpu'
          else
            'grep processor /proc/cpuinfo |wc -l'
          end

    cpus = `#{cmd}`.chomp.to_i
    cpus < 1 ? 1 : cpus
  rescue StandardError
    # the command is not available
    1
  end

end # Seqtrim class
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
########################################################
|
|
2
|
+
# Author: Almudena Bocinos Rioboo
|
|
3
|
+
#
|
|
4
|
+
# Defines the class Sequence's attribute
|
|
5
|
+
#
|
|
6
|
+
########################################################
|
|
7
|
+
|
|
8
|
+
# A sequence read: its name, bases, qualities and comment, together with the
# flags that record whether it was rejected, repeated or reversed.
class Sequence

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :seq_comment, :seq_rejected, :seq_repeated, :seq_reversed
  attr_accessor :seq_rejected_by_message

  # Stores the name, the bases, the qualities and the comment of the sequence.
  # All flags start as false and the rejection message as an empty string.
  def initialize(seq_name, seq_fasta, seq_qual, seq_comment = '')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    @seq_qual = seq_qual
    @seq_comment = seq_comment

    @seq_rejected_by_message = ''

    @seq_rejected = false
    @seq_repeated = false
    @seq_reversed = false

    # remember whether the bases had any N or X at creation time
    @ns_present = ns_present?
    @xs_present = xs_present?
  end

  # Returns true when the bases contain at least one 'N'
  def ns_present?
    !@seq_fasta.index('N').nil?
  end

  # Returns true when the bases contain at least one 'X'
  def xs_present?
    !@seq_fasta.index('X').nil?
  end

  # Returns true when the sequence has at least +seq_min_length+ bases
  def seq_is_long_enough(seq_min_length)
    seq_min_length <= @seq_fasta.length
  end

  # Returns the sequence as a fasta entry: a ">name" line followed by the bases
  def to_fasta
    header = ">#{@seq_name}\n"
    header + @seq_fasta
  end

  # Returns the qualities as a qual entry: a ">name" line followed by the qualities
  def to_qual
    ">#{@seq_name}\n#{@seq_qual}"
  end

end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# A list of sequences that is handled as a unit, together with the stats,
# output text and output files collected for it.
class SequenceGroup

  attr_accessor :stats, :output_text, :output_files

  # seqs:: initial list of sequences of the group.
  def initialize(seqs)
    @seqs = seqs
    @stats = {}
    @output_text = {}
    @output_files = {}
  end

  # Appends +seq+ at the end of the group
  def push(seq)
    @seqs.push(seq)
  end

  # Removes +seq+ from the group
  def delete(seq)
    @seqs.delete(seq)
  end

  # Returns true when the group has no sequences
  def empty?
    @seqs.empty?
  end

  # Yields every sequence, in order
  def each
    @seqs.each { |seq| yield seq }
  end

  # Yields every sequence with its position in the group
  def each_with_index
    @seqs.each_with_index { |seq, i| yield seq, i }
  end

  # Yields every sequence, from the last one to the first one
  def reverse_each
    @seqs.reverse_each { |seq| yield seq }
  end

  # Appends all the sequences of +array+ to the group
  def add(array)
    @seqs = @seqs + array
  end

  # Returns the number of sequences in the group
  def count
    @seqs.count
  end

  # Returns true when +s+ is one of the sequences of the group
  def include?(s)
    @seqs.include?(s)
  end

  # Leaves the group without sequences
  def remove_all_seqs
    @seqs = []
  end

  # def job_identifier
  #   return @seqs[0].seq_name
  # end

  # Short description of the group
  def inspect
    "Group with #{@seqs.count} sequences"
  end

end
|