seqtrimnext 2.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env ruby
2
+
3
#########################################
# Author:: Almudena Bocinos Rioboo
# Slides a restriction-site pattern along a sequence and, for every window,
# counts matching bases, 'N' bases and mismatches. The constructor runs the
# scan and prints a trace plus the best hit to STDOUT.
#
# The class also carries a small name/value parameter store (read_file,
# get_param, set_param, exists?) backed by the hash @h.
#########################################
class ScanForRestrSite

  # Stores the inputs, runs the scan and prints every window plus the best hit.
  #
  # sequence:: String that is scanned
  # rest::     String with the restriction site to look for
  def initialize(sequence, rest)
    @seq_fasta = sequence
    @rest = rest
    # Backing store for the parameter helpers below; without it every one of
    # them would fail on nil.
    @h = {}

    puts "#{@seq_fasta} , #{@rest}"
    res = execute

    res.each do |e|
      puts "#{e.join(',')}"
    end

    # selects from res the hit with the most matching bases
    puts "--- MAX: --- "

    # res is empty when the sequence is shorter than the site; fall back to an
    # empty hit instead of calling join on nil.
    max = res.max { |e1, e2| e1[1] <=> e2[1] } || []

    puts max.join(' ; ')

    # checks if the best hit covers the whole site, allowing one error for
    # sites longer than 4 bases
    margen = (@rest.size <= 4) ? 0 : 1 # <- don't change
    if (max[1] != @rest.size) && (max[1] != @rest.size - margen)
      puts "-the max good hit hasn't the size minimum: #{@rest.size} or #{@rest.size-margen} "
      max = []
    end

    #read_file(path)
  end

  # Compares the site against every window of the sequence (ported from an
  # earlier Perl loop) and prints a trace of each comparison.
  #
  # Returns an Array with one entry per start position p:
  # [p, matches, n_bases, mismatches]. It is empty when the sequence is
  # shorter than the site.
  def execute
    r = []

    (0..@seq_fasta.size - @rest.size).each do |p|
      os = 0 # bases equal to the site
      ns = 0 # bases that are 'N' in the sequence
      xs = 0 # any other mismatch
      puts "-------[#{p}]-#{@seq_fasta[p,@seq_fasta.size-p]} , #{@rest}"

      @rest.each_char.each_with_index do |cc, i|
        c = @seq_fasta[i + p].chr
        puts "(#{c}==#{cc})=>#{c==cc}"
        if c == cc
          os += 1
        elsif c == 'N'
          ns += 1
        else
          xs += 1
        end
      end

      r[p] = [p, os, ns, xs]
      puts r[p].join(',')
    end

    r
  end

  # Reads a "name = value" file into the parameter store.
  # Empty lines and lines starting with '#' are skipped.
  #
  # path_fichero:: path of the file to read
  def read_file(path_fichero)
    # File.foreach closes the file when the block finishes.
    File.foreach(path_fichero) do |line|
      line.chomp! # delete end of line

      next if line.empty?
      next if line =~ /^#/ # comment line

      # parameter's name in params[0], parameter's value in params[1]
      params = line.split(/\s*=\s*/)
      @h[params[0]] = params[1]

      $LOG.debug "read: #{params[1]}"
    end
    $LOG.info "File Params have been readed"
  end

  # Logs the name/value pair of every stored parameter.
  def print_parameters()
    @h.each do |clave, valor|
      $LOG.debug "The Parameter #{clave} have the value " +valor.to_s
    end
  end

  # Returns the stored value for param, or nil when it is not defined.
  def get_param(param)
    @h[param]
  end

  # Stores value under the name param.
  def set_param(param, value)
    @h[param] = value
  end

  #attr_accessor :h # to access the attribute 'h' from outside this class

  # Returns true if the parameter has a non-nil value, false otherwise.
  def exists?(param_name)
    !@h[param_name].nil?
  end

end
136
# Demo run: only executed when this file is run directly, so that requiring
# the file as a library does not print the scan trace.
if __FILE__ == $0
  scan = ScanForRestrSite.new("AaaaACGTACGT", "AGTAC")
end
137
+ # scan = ScanForRestrSite.new("AaaaACGTAeCGT", "AGTAC")
138
+
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
+ class Array
6
+
7
+ def sum
8
+ r=0
9
+ each do |e|
10
+ r+=e
11
+ end
12
+ return r
13
+ end
14
+
15
+ end
16
+
17
# Sliding-window statistics over an array of numeric values.
class ScbiStats

  # values:: Array of numbers
  def initialize(values)
    @values = values
  end

  # Returns the sum of the values in the window centred on index i, reaching
  # window_size positions to each side and clipped to the array bounds.
  # Returns 0 when the window lies completely outside the array.
  def get_window_value(i, window_size=10)
    start_pos = [0, i - window_size].max
    end_pos = [@values.length - 1, i + window_size].min

    # The slice is nil when start_pos is past the end of the array.
    window = @values[start_pos..end_pos] || []
    window.inject(0) { |acc, e| acc + e }
  end

  # Returns the index whose surrounding window has the largest sum (the first
  # such index on ties). Returns 0 for an empty array.
  #
  # window_size:: half-width of the window used for every index
  def fat_mode(window_size=10)
    fat_modes = []
    max_fat = 0

    @values.length.times do |i|
      # window_size has to be forwarded, otherwise the default of 10 is
      # always used.
      fat = get_window_value(i, window_size)

      fat_modes << fat

      max_fat = i if fat_modes[max_fat] < fat
    end

    max_fat
  end

end
55
+
56
+
57
+ # istat=JSON.parse(File.read('initial_stats.json'))
58
+ #
59
+ # x=[]
60
+ # istat['qv'].each do |qv|
61
+ # x<< qv['tot'].to_i
62
+ #
63
+ # end
64
+ # # Usage:
65
+ #
66
+ # s=ScbiStats.new(x)
67
+ #
68
+ # puts s.fat_mode
@@ -0,0 +1,317 @@
1
+ ######################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ # This is the main class.
4
+ ######################################
5
+
6
+ require 'extract_stats'
7
+
8
+ # $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
9
+
10
+ require 'scbi_mapreduce'
11
+ require 'seqtrim_work_manager'
12
+ require 'action_manager'
13
+
14
+ # SEQTRIM_VERSION_REVISION=27
15
+ # SEQTRIM_VERSION_STAGE = 'b'
16
+ # $SEQTRIM_VERSION = "2.0.0#{SEQTRIM_VERSION_STAGE}#{SEQTRIM_VERSION_REVISION}"
17
+
18
class Seqtrim

  # Main entry point of SeqTrimNext.
  #
  # In server mode the constructor opens the input sequences, loads and checks
  # the parameters, loads actions and plugins, computes the global stats,
  # optionally runs cd-hit-454 to detect clonal sequences, and finally starts
  # the ScbiMapreduce manager. In "only workers" mode it just launches workers
  # that connect to an already running server.

  # Builds the shell command that runs cd-hit-454 on cd_hit_input_file.
  #
  # cd_hit_input_file:: value interpolated after "-i" in the command
  # workers::           either a worker count, or a list of worker host names
  #                     with one entry per worker
  # init_file_path::    file that is sourced before cd-hit when the command is
  #                     sent to another host and the file exists locally
  #
  # With a worker count the command runs locally with that many CPUs. With a
  # host list the host that appears most often is chosen and its number of
  # entries is used as the CPU count; when that host is not the first of the
  # list the command is wrapped in an ssh call.
  #
  # Returns the command as a String.
  def get_cd_hit_cmd(cd_hit_input_file, workers, init_file_path)
    # Plain worker count. Integer() also turns a numeric String into a number
    # so that the memory limit below is really computed arithmetically.
    unless workers.respond_to?(:map)
      return cd_hit_command(cd_hit_input_file, Integer(workers))
    end

    # Host list: count how many workers every host contributes.
    worker_hash = {}
    workers.each { |e| worker_hash[e] = (worker_hash[e] || 0) + 1 }

    ranked_workers = worker_hash.sort_by { |k, v| -v }
    max_worker = ranked_workers.first
    puts "Found these workers: #{ranked_workers}"

    cmd = cd_hit_command(cd_hit_input_file, max_worker[1])

    # Chosen host is the first entry of the list: run the command as is.
    return cmd if max_worker[0] == workers[0]

    # Otherwise send it over ssh.
    init = ''
    init = ". #{init_file_path}; " if File.exist?(init_file_path)

    pwd = `pwd`.chomp

    cd = ''
    cd = "cd #{pwd}; " if File.exist?(pwd)

    "ssh #{max_worker[0]} \"#{init} #{cd} #{cmd}\""
  end

  # Checks the global (non plugin) parameters through params.check_param,
  # passing a type, a default value and a comment for each one, and logs every
  # reported error.
  #
  # Returns true when no errors were reported.
  def check_global_params(params)
    errors = []

    # [name, type, default value, comment] - checked in this order
    global_params = [
      ['plugin_list', 'PluginList', nil,
       'Plugins applied to every sequence, separated by commas. Order is important'],
      ['next_generation_sequences', 'String', 'true',
       'Should SeqTrimNext analysis be based on NGS? (if setting to false, a classic Sanger sequencing is considered)'],
      ['remove_clonality', 'String', 'true',
       'Remove duplicated (clonal) sequences (using CD-HIT 454)'],
      ['min_insert_size_trimmed', 'Integer', 40,
       'Minimum insert size for every trimmed sequence'],
      ['min_insert_size_paired', 'Integer', 40,
       'Minimum insert size for each end of paired-end reads; true paired-ends have both single-ends longer than this value'],
      ['accept_very_long_sequences', 'String', 'true',
       'Do not reject unexpectedly long sequences found in the raw data'],
      ['seqtrim_version', 'String', Seqtrimnext::SEQTRIM_VERSION,
       'Seqtrim version']
    ]

    global_params.each do |name, type, default_value, comment|
      params.check_param(errors, name, type, default_value, comment)
    end

    unless errors.empty?
      $LOG.error 'Please, define the following global parameters in params file:'
      errors.each do |error|
        $LOG.error '   -' + error
      end
    end

    errors.empty?
  end

  # Runs SeqTrimNext.
  #
  # options is a Hash; the keys read here are :template, :server_ip, :port,
  # :workers, :only_workers, :chunk_size, :json and, in server mode, :fastq or
  # :fasta plus :qual.
  #
  # The process exits when the global parameters or the plugin parameters are
  # not valid, or when clonality removal is requested and no cluster file is
  # available.
  def initialize(options)
    params_path = options[:template]

    ip = options[:server_ip]
    port = options[:port]
    workers = options[:workers]
    only_workers = options[:only_workers]
    chunk_size = options[:chunk_size]
    use_json = options[:json]

    sequence_reader = nil
    params = nil

    # it is the server part
    unless only_workers

      # TODO - FIX seqtrim to not iterate two times over input, so STDIN can be used
      sequence_reader, cd_hit_input_file = open_sequence_reader(options)

      $LOG.info "Loading params"
      # Reads the parameter's file
      params = Params.new(params_path)

      $LOG.info "Checking global params"
      exit unless check_global_params(params)

      # Load actions
      $LOG.info "Loading actions"
      ActionManager.new()

      # load plugins
      plugin_list = params.get_param('plugin_list')
      $LOG.info "Loading plugins [#{plugin_list}]"

      plugin_manager = PluginManager.new(plugin_list, params)

      # check plugin params
      $LOG.info "Check plugin params"
      unless plugin_manager.check_plugins_params(params)
        $LOG.error "Plugin check failed"

        # save used params to file
        params.save_file('used_params.txt')

        exit
      end

      Dir.mkdir(OUTPUT_PATH) unless Dir.exist?(OUTPUT_PATH)

      $LOG.info "Calculatings stats"
      # Extract global stats
      ExtractStats.new(sequence_reader, params)

      # save used params to file
      params.save_file(File.join(OUTPUT_PATH, 'used_params.txt'))

      # Only needed by the disabled blast formatting step below.
      piro_on = (params.get_param('next_generation_sequences') == 'true')

      # format blast database with truncated file
      #MakeBlastDb.format_db(es.truncated_file_path,File.basename(es.truncated_file_path,File.extname(es.truncated_file_path)),'./') if piro_on

      # load mids and linkers
      params.load_mids(File.join($FORMATTED_DB_PATH, 'mids.fasta'))
      params.load_linkers(File.join($FORMATTED_DB_PATH, 'linkers.fasta'))

      # execute cd-hit
      if params.get_param('remove_clonality') == 'true'
        load_clonal_clusters(params, cd_hit_input_file, workers)
      end

    end # end only_workers

    custom_worker_file = File.join(File.dirname(__FILE__), 'em_classes', 'seqtrim_worker.rb')

    $LOG.info "Workers:\n#{workers}"

    if only_workers
      worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip, port, workers, custom_worker_file, STDOUT)
      worker_launcher.launch_workers_and_wait
    else
      $LOG.info 'Starting server'

      SeqtrimWorkManager.init_work_manager(sequence_reader, params, chunk_size, use_json)

      cpus = detect_cpu_count

      # Reduce the workers by one (because of the server) when more than one
      # was requested and there are fewer workers than CPUs.
      if workers.respond_to?(:shift)
        workers.shift if workers.count > 1 && workers.count < cpus
      elsif workers > 1 && workers < cpus
        workers -= 1
      end

      # launch processor server passing the ip, port and all required params
      server = ScbiMapreduce::Manager.new(ip, port, workers, SeqtrimWorkManager, custom_worker_file, STDOUT, '~/.seqtrimnext')
      server.chunk_size = chunk_size
      server.start_server

      # close sequence reader
      sequence_reader.close
      $LOG.info 'Closing server'
    end
  end

  private

  # Returns the local cd-hit-454 command line for the given number of CPUs.
  def cd_hit_command(cd_hit_input_file, num_cpus_cdhit)
    "cd-hit-454 -i #{cd_hit_input_file} -o clusters.fasta -M #{num_cpus_cdhit*1000} -T #{num_cpus_cdhit} > cd-hit-454.out"
  end

  # Opens the reader for the input sequences selected in options.
  #
  # Returns [sequence_reader, cd_hit_input_file].
  def open_sequence_reader(options)
    if options[:fastq]
      # '-' selects standard input
      seqs_path = (options[:fastq] == '-') ? STDIN : File.expand_path(options[:fastq])
      sequence_reader = FastqFile.new(seqs_path, 'r', :sanger, true)
    else
      seqs_path = File.expand_path(options[:fasta])
      # The qual file is optional; expand it only when one was given.
      qual_path = options[:qual] ? File.expand_path(options[:qual]) : nil
      sequence_reader = FastaQualFile.new(seqs_path, qual_path, true)
    end

    [sequence_reader, seqs_path]
  end

  # Runs cd-hit-454 unless a cluster file is already present in the current
  # directory, then loads the clusters into params. Exits when no cluster
  # file is available afterwards.
  def load_clonal_clusters(params, cd_hit_input_file, workers)
    cmd = get_cd_hit_cmd(cd_hit_input_file, workers, File.join($SEQTRIM_PATH, 'init_env'))

    $LOG.info "Executing cd-hit-454: #{cmd}"

    system(cmd) unless File.exist?('clusters.fasta.clstr')

    if File.exist?('clusters.fasta.clstr')
      params.load_repeated_seqs('clusters.fasta.clstr')
    else
      $LOG.error("Exiting due to not found clusters.fasta.clstr. Maybe cd-hit failed. Check cd-hit-454.out")
      exit
    end
  end

  # Returns the CPU count reported by the platform tools, or 1 when the
  # query fails with an error.
  def detect_cpu_count
    if RUBY_PLATFORM.downcase.include?("darwin")
      `hwprefs -cpu_count`.chomp.to_i
    else
      `grep processor /proc/cpuinfo |wc -l`.chomp.to_i
    end
  rescue
    1
  end

end # Seqtrim class
@@ -0,0 +1,55 @@
1
+ ########################################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ #
4
+ # Defines the class Sequence's attribute
5
+ #
6
+ ########################################################
7
+
8
class Sequence

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :seq_comment
  attr_accessor :seq_rejected, :seq_repeated, :seq_reversed
  attr_accessor :seq_rejected_by_message

  # Stores the name, bases, qualities and comment of one sequence and resets
  # its rejected/repeated/reversed flags.
  def initialize(seq_name,seq_fasta,seq_qual, seq_comment = '')
    @seq_name    = seq_name
    @seq_fasta   = seq_fasta
    @seq_qual    = seq_qual
    @seq_comment = seq_comment

    @seq_rejected            = false
    @seq_repeated            = false
    @seq_reversed            = false
    @seq_rejected_by_message = ''

    # Snapshot taken at construction time from the two predicates below.
    @ns_present = ns_present?
    @xs_present = xs_present?
  end

  # True when the sequence contains at least one 'N'.
  def ns_present?
    !@seq_fasta.index('N').nil?
  end

  # True when the sequence contains at least one 'X'.
  def xs_present?
    !@seq_fasta.index('X').nil?
  end

  # True when the sequence has at least seq_min_length bases.
  def seq_is_long_enough(seq_min_length)
    seq_min_length <= @seq_fasta.length
  end

  # Returns the sequence as a fasta record (header line plus bases).
  def to_fasta
    ">#{@seq_name}\n" + @seq_fasta
  end

  # Returns the qualities as a record with the same header line.
  def to_qual
    ">#{@seq_name}\n#{@seq_qual}"
  end

end
@@ -0,0 +1,72 @@
1
+
2
+
3
# A group of sequences that is handled as one unit, together with the stats
# and output collected for it.
class SequenceGroup

  # Adds map, select, etc. on top of #each.
  include Enumerable

  attr_accessor :stats,:output_text,:output_files

  # seqs:: Array with the initial sequences of the group
  def initialize(seqs)
    @stats={}
    @seqs=seqs
    @output_text={}
    @output_files={}
  end

  # Appends one sequence to the group.
  def push(seq)
    @seqs.push seq
  end

  # Removes the given sequence from the group.
  def delete(seq)
    @seqs.delete(seq)
  end

  # True when the group holds no sequences.
  def empty?
    @seqs.empty?
  end

  # Yields every sequence. Returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(:each) unless block_given?
    @seqs.each(&block)
  end

  # Yields every sequence with its index. Returns an Enumerator when called
  # without a block.
  def each_with_index(&block)
    return enum_for(:each_with_index) unless block_given?
    @seqs.each_with_index(&block)
  end

  # Yields every sequence from last to first. Returns an Enumerator when
  # called without a block.
  def reverse_each(&block)
    return enum_for(:reverse_each) unless block_given?
    @seqs.reverse_each(&block)
  end

  # Appends all sequences of array. A new internal array is built, so the
  # array given to the constructor is not modified by this call.
  def add(array)
    @seqs += array
  end

  # Number of sequences in the group.
  def count
    @seqs.count
  end

  # True when s is one of the sequences of the group.
  def include?(s)
    @seqs.include?(s)
  end

  # Empties the group by replacing the internal array with a new one.
  def remove_all_seqs
    @seqs=[]
  end

  # def job_identifier
  #   return @seqs[0].seq_name
  # end

  # Short description used instead of dumping every sequence.
  def inspect
    "Group with #{@seqs.count} sequences"
  end

end