seqtrimnext 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #########################################
4
+ # Author:: Almudena Bocinos Rioboo
5
+ # This class scans a sequence for the best (possibly inexact) occurrence of a restriction site, counting exact matches, 'N' positions and mismatches at every offset
6
+ #########################################
7
# Scans a sequence for the best (possibly inexact) occurrence of a
# restriction site.
#
# For every offset at which the site fits inside the sequence, the scanner
# counts exact matches, positions where the sequence holds an 'N', and
# mismatches. The comparison is case-sensitive. The whole scan is traced to
# STDOUT.
#
# The class also carries a small name/value parameter store (read_file,
# get_param, set_param, exists?) backed by the @h hash.
class ScanForRestrSite

  # Every per-offset result, as [offset, matches, ns, mismatches] arrays.
  attr_reader :hits

  # The accepted best hit ([offset, matches, ns, mismatches]) or [] when no
  # offset matched well enough.
  attr_reader :best_hit

  # Runs the scan immediately and prints the per-offset results and the best
  # hit.
  #
  # sequence:: String to scan
  # rest::     String with the restriction site to look for
  def initialize(sequence, rest)
    @seq_fasta = sequence
    @rest = rest
    # Backing store of the parameter helpers; it used to be left
    # uninitialised, which made every helper raise NoMethodError.
    @h = {}

    puts "#{@seq_fasta} , #{@rest}"
    @hits = execute

    @hits.each do |hit|
      puts "#{hit.join(',')}"
    end

    # selects, from all hits, the one with most exact matches
    puts "--- MAX: --- "

    # There are no hits at all when the site is longer than the sequence;
    # fall back to an empty hit instead of calling join on nil.
    best = @hits.max { |e1, e2| e1[1] <=> e2[1] } || []

    puts best.join(' ; ')

    # checks that the best hit matches the whole site, allowing one error
    # for sites longer than 4 bases
    margen = (@rest.size <= 4) ? 0 : 1 # <- don't change
    unless [@rest.size, @rest.size - margen].include?(best[1])
      puts "-the max good hit hasn't the size minimum: #{@rest.size} or #{@rest.size-margen} "
      best = []
    end

    @best_hit = best
  end

  # Compares the restriction site against the sequence at every offset where
  # it fits completely (Ruby port of an earlier Perl loop).
  #
  # Returns an Array with one [offset, matches, ns, mismatches] entry per
  # offset; the Array is empty when the site is longer than the sequence.
  def execute
    last_offset = @seq_fasta.size - @rest.size

    (0..last_offset).map do |offset|
      os = 0 # exact matches
      ns = 0 # sequence positions holding an 'N'
      xs = 0 # mismatches
      puts "-------[#{offset}]-#{@seq_fasta[offset,@seq_fasta.size-offset]} , #{@rest}"

      @rest.each_char.with_index do |cc, i|
        c = @seq_fasta[i + offset].chr
        puts "(#{c}==#{cc})=>#{c==cc}"
        if c == cc
          os += 1
        elsif c == 'N'
          ns += 1
        else
          xs += 1
        end
      end

      hit = [offset, os, ns, xs]
      puts hit.join(',')
      hit
    end
  end

  # Reads a "name = value" parameter file into the parameter store.
  # Empty lines and lines starting with '#' are skipped.
  #
  # path_fichero:: path of the parameter file
  def read_file(path_fichero)
    # File.foreach closes the file when the iteration ends
    File.foreach(path_fichero) do |line|
      line.chomp! # delete end of line

      next if line.empty?
      next if line =~ /^#/ # comment line

      # params[0] holds the parameter name, params[1] its value
      params = line.split(/\s*=\s*/)
      @h[params[0]] = params[1]

      $LOG.debug "read: #{params[1]}"
    end
    $LOG.info "File Params have been readed"
  end

  # Logs the name/value pair of every stored parameter
  def print_parameters()
    @h.each do |clave, valor|
      $LOG.debug "The Parameter #{clave} have the value " + valor.to_s
    end
  end

  # Returns the value stored for param, or nil when it is not defined
  def get_param(param)
    @h[param]
  end

  # Stores value under the name param
  def set_param(param, value)
    @h[param] = value
  end

  # Returns true when the parameter has a non-nil value, false otherwise
  def exists?(param_name)
    !@h[param_name].nil?
  end

end
136
# Demo scan. It only runs when this file is executed directly, so that
# loading the file as a library does not trigger a scan and its STDOUT trace.
if __FILE__ == $0
  scan = ScanForRestrSite.new("AaaaACGTACGT", "AGTAC")
  # scan = ScanForRestrSite.new("AaaaACGTAeCGT", "AGTAC")
end
138
+
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
class Array

  # Fallback for Ruby versions that lack Array#sum (older than 2.4).
  # The guard keeps the built-in implementation untouched where it exists:
  # redefining it unconditionally would drop support for the initial value
  # and the block form.
  unless method_defined?(:sum)
    # Adds up the elements (or the block results, when a block is given),
    # starting from init.
    def sum(init = 0)
      total = init
      each do |e|
        total += block_given? ? yield(e) : e
      end
      total
    end
  end

end
16
+
17
# Sliding-window statistics over a list of numeric values.
class ScbiStats

  # values:: Array of numbers to analyse
  def initialize(values)
    @values = values
  end

  # Returns the sum of the values found between i - window_size and
  # i + window_size (both included), clipped to the bounds of the list.
  def get_window_value(i, window_size = 10)
    start_pos = [0, i - window_size].max
    # The upper bound may equal the list length; range slicing simply stops
    # at the last element.
    end_pos = [@values.length, i + window_size].min

    @values[start_pos..end_pos].sum
  end

  # Returns the index whose surrounding window has the largest sum. On ties
  # the first such index wins; an empty list yields 0.
  #
  # window_size:: half-width of the window, passed on to get_window_value
  #               (it used to be ignored, so the default of 10 always applied)
  def fat_mode(window_size = 10)
    fat_modes = []
    max_fat = 0

    @values.length.times do |i|
      fat = get_window_value(i, window_size)
      fat_modes << fat

      max_fat = i if fat_modes[max_fat] < fat
    end

    max_fat
  end

end
55
+
56
+
57
+ # istat=JSON.parse(File.read('initial_stats.json'))
58
+ #
59
+ # x=[]
60
+ # istat['qv'].each do |qv|
61
+ # x<< qv['tot'].to_i
62
+ #
63
+ # end
64
+ # # Usage:
65
+ #
66
+ # s=ScbiStats.new(x)
67
+ #
68
+ # puts s.fat_mode
@@ -0,0 +1,317 @@
1
+ ######################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ # This is the main class.
4
+ ######################################
5
+
6
+ require 'extract_stats'
7
+
8
+ # $: << File.expand_path('~/progs/ruby/gems/scbi_drb/lib')
9
+
10
+ require 'scbi_mapreduce'
11
+ require 'seqtrim_work_manager'
12
+ require 'action_manager'
13
+
14
+ # SEQTRIM_VERSION_REVISION=27
15
+ # SEQTRIM_VERSION_STAGE = 'b'
16
+ # $SEQTRIM_VERSION = "2.0.0#{SEQTRIM_VERSION_STAGE}#{SEQTRIM_VERSION_REVISION}"
17
+
18
# Main class of SeqTrimNext.
#
# Creating an instance runs the whole program. Unless it is started in
# "only workers" mode it:
#
# 1. opens the input sequences (fastq, or fasta with an optional qual file),
# 2. reads the parameter file, which holds the value of every parameter and
#    the 'plugin_list' that fixes the order in which plugins are executed,
# 3. loads actions and plugins and checks that the parameter file defines
#    what every plugin needs,
# 4. extracts global stats and, when requested, runs cd-hit-454 to find
#    duplicated sequences,
# 5. starts the ScbiMapreduce manager that distributes the sequences.
#
# In "only workers" mode it just launches workers that connect to an already
# running server.
class Seqtrim

  # Builds the shell command that runs cd-hit-454 over cd_hit_input_file.
  #
  # cd_hit_input_file:: path of the sequences handed to cd-hit-454
  # workers::           either a number of local workers, or a list of host
  #                     names with one entry per worker
  # init_file_path::    script sourced before cd-hit-454 when the command is
  #                     sent to another host, if the file exists
  #
  # With a number the command runs locally using that many CPUs. With a list
  # it runs on the host that appears most often; the command is wrapped in
  # ssh unless that host is the first entry of the list.
  #
  # NOTE: paths are interpolated into the command without shell escaping.
  def get_cd_hit_cmd(cd_hit_input_file, workers, init_file_path)
    local_cpus = numeric_worker_count(workers)
    return cd_hit_command(cd_hit_input_file, local_cpus) if local_cpus

    # not a number: count how many workers were requested on every host
    worker_hash = {}
    workers.each { |host| worker_hash[host] = (worker_hash[host] || 0) + 1 }

    ranked = worker_hash.sort_by { |_host, count| -count }
    puts "Found these workers: #{ranked}"
    max_worker = ranked.first
    num_cpus_cdhit = max_worker[1]

    cmd = cd_hit_command(cd_hit_input_file, num_cpus_cdhit)

    # the chosen host is the current machine, run locally
    return cmd if max_worker[0] == workers[0]

    # the chosen host is another machine, send the command over ssh
    init = File.exist?(init_file_path) ? ". #{init_file_path}; " : ''

    pwd = `pwd`.chomp
    cd = File.exist?(pwd) ? "cd #{pwd}; " : ''

    "ssh #{max_worker[0]} \"#{init} #{cd} #{cmd}\""
  end

  # Checks the global (plugin independent) parameters through
  # params.check_param and logs whatever that call adds to the error list.
  #
  # Returns true when no error was reported.
  def check_global_params(params)
    errors = []

    # name, type, default value, comment
    global_params = [
      ['plugin_list', 'PluginList', nil,
       'Plugins applied to every sequence, separated by commas. Order is important'],
      ['next_generation_sequences', 'String', 'true',
       'Should SeqTrimNext analysis be based on NGS? (if setting to false, a classic Sanger sequencing is considered)'],
      ['remove_clonality', 'String', 'true',
       'Remove duplicated (clonal) sequences (using CD-HIT 454)'],
      ['min_insert_size_trimmed', 'Integer', 40,
       'Minimum insert size for every trimmed sequence'],
      ['min_insert_size_paired', 'Integer', 40,
       'Minimum insert size for each end of paired-end reads; true paired-ends have both single-ends longer than this value'],
      ['accept_very_long_sequences', 'String', 'true',
       'Do not reject unexpectedly long sequences found in the raw data'],
      ['seqtrim_version', 'String', Seqtrimnext::SEQTRIM_VERSION,
       'Seqtrim version']
    ]

    global_params.each do |name, type, default_value, comment|
      params.check_param(errors, name, type, default_value, comment)
    end

    unless errors.empty?
      $LOG.error 'Please, define the following global parameters in params file:'
      errors.each do |error|
        $LOG.error ' -' + error
      end
    end

    errors.empty?
  end

  # Runs SeqTrimNext.
  #
  # options:: Hash read through the keys :template, :server_ip, :port,
  #           :workers, :only_workers, :chunk_size, :json, :fastq, :fasta
  #           and :qual
  def initialize(options)
    params_path = options[:template]

    ip = options[:server_ip]
    port = options[:port]
    workers = options[:workers]
    only_workers = options[:only_workers]
    chunk_size = options[:chunk_size]
    use_json = options[:json]

    sequence_reader = nil
    params = nil

    # it is the server part
    unless only_workers
      sequence_reader, cd_hit_input_file = open_sequence_reader(options)

      $LOG.info "Loading params"
      # Reads the parameter's file
      params = Params.new(params_path)

      $LOG.info "Checking global params"
      exit unless check_global_params(params)

      # Load actions
      $LOG.info "Loading actions"
      ActionManager.new

      # load plugins
      plugin_list = params.get_param('plugin_list')
      $LOG.info "Loading plugins [#{plugin_list}]"
      plugin_manager = PluginManager.new(plugin_list, params)

      # check plugin params
      $LOG.info "Check plugin params"
      unless plugin_manager.check_plugins_params(params)
        $LOG.error "Plugin check failed"

        # save used params to file
        params.save_file('used_params.txt')

        exit
      end

      Dir.mkdir(OUTPUT_PATH) unless Dir.exist?(OUTPUT_PATH)

      $LOG.info "Calculatings stats"
      # Extract global stats
      ExtractStats.new(sequence_reader, params)

      # save used params to file
      params.save_file(File.join(OUTPUT_PATH, 'used_params.txt'))

      # load mids and linkers
      params.load_mids(File.join($FORMATTED_DB_PATH, 'mids.fasta'))
      params.load_linkers(File.join($FORMATTED_DB_PATH, 'linkers.fasta'))

      # execute cd-hit
      if params.get_param('remove_clonality') == 'true'
        load_clonal_sequences(params, cd_hit_input_file, workers)
      end
    end

    custom_worker_file = File.join(File.dirname(__FILE__), 'em_classes', 'seqtrim_worker.rb')

    $LOG.info "Workers:\n#{workers}"

    if only_workers
      worker_launcher = ScbiMapreduce::WorkerLauncher.new(ip, port, workers, custom_worker_file, STDOUT)
      worker_launcher.launch_workers_and_wait
    else
      $LOG.info 'Starting server'

      SeqtrimWorkManager.init_work_manager(sequence_reader, params, chunk_size, use_json)

      cpus = detect_cpu_count

      # leave room for the server: use one worker less when more than one
      # was requested and they do not already exceed the number of CPUs
      if numeric_worker_count(workers)
        workers -= 1 if workers > 1 && workers < cpus
      elsif workers.count > 1 && workers.count < cpus
        workers.shift
      end

      # launch processor server passing the ip, port and all required params
      server = ScbiMapreduce::Manager.new(ip, port, workers, SeqtrimWorkManager, custom_worker_file, STDOUT, '~/.seqtrimnext')
      server.chunk_size = chunk_size
      server.start_server

      # close sequence reader
      sequence_reader.close
      $LOG.info 'Closing server'
    end
  end

  private

  # Returns workers as an Integer when it denotes a number, or nil when it
  # is something else (e.g. a list of host names).
  def numeric_worker_count(workers)
    Integer(workers)
  rescue ArgumentError, TypeError
    nil
  end

  # Command line that runs cd-hit-454 locally with num_cpus threads.
  def cd_hit_command(cd_hit_input_file, num_cpus)
    "cd-hit-454 -i #{cd_hit_input_file} -o clusters.fasta -M #{num_cpus * 1000} -T #{num_cpus} > cd-hit-454.out"
  end

  # Opens the reader for the input sequences.
  # Returns [sequence_reader, cd_hit_input_file].
  def open_sequence_reader(options)
    # TODO - FIX seqtrim to not iterate two times over input, so STDIN can be used
    if options[:fastq]
      seqs_path = options[:fastq] == '-' ? STDIN : File.expand_path(options[:fastq])
      reader = FastqFile.new(seqs_path, 'r', :sanger, true)
    else
      seqs_path = File.expand_path(options[:fasta])
      # the qual file is optional
      qual_path = File.expand_path(options[:qual]) if options[:qual]
      reader = FastaQualFile.new(seqs_path, qual_path, true)
    end

    [reader, seqs_path]
  end

  # Runs cd-hit-454, unless its cluster file is already there, and loads the
  # repeated sequences it reports. Exits when no cluster file is available.
  def load_clonal_sequences(params, cd_hit_input_file, workers)
    cmd = get_cd_hit_cmd(cd_hit_input_file, workers, File.join($SEQTRIM_PATH, 'init_env'))

    $LOG.info "Executing cd-hit-454: #{cmd}"

    system(cmd) unless File.exist?('clusters.fasta.clstr')

    if File.exist?('clusters.fasta.clstr')
      params.load_repeated_seqs('clusters.fasta.clstr')
    else
      $LOG.error("Exiting due to not found clusters.fasta.clstr. Maybe cd-hit failed. Check cd-hit-454.out")
      exit
    end
  end

  # Number of CPUs reported by the system tools; 1 when they cannot be run.
  def detect_cpu_count
    if RUBY_PLATFORM.downcase.include?("darwin")
      `hwprefs -cpu_count`.chomp.to_i
    else
      `grep processor /proc/cpuinfo |wc -l`.chomp.to_i
    end
  rescue
    1
  end

end # Seqtrim class
@@ -0,0 +1,55 @@
1
+ ########################################################
2
+ # Author: Almudena Bocinos Rioboo
3
+ #
4
+ # Defines the class Sequence's attribute
5
+ #
6
+ ########################################################
7
+
8
# Holds one input sequence: its name, bases, qualities and comment, plus the
# flags that record whether it was rejected, repeated or reversed.
class Sequence

  attr_accessor :seq_name, :seq_fasta, :seq_qual, :seq_comment, :seq_rejected, :seq_repeated, :seq_reversed
  attr_accessor :seq_rejected_by_message

  # seq_name::    name of the sequence
  # seq_fasta::   String with the bases
  # seq_qual::    qualities of the sequence
  # seq_comment:: optional comment, empty by default
  def initialize(seq_name, seq_fasta, seq_qual, seq_comment = '')
    @seq_fasta = seq_fasta
    @seq_name = seq_name
    @seq_qual = seq_qual
    @seq_comment = seq_comment

    @seq_rejected = false
    @seq_repeated = false
    @seq_reversed = false

    @seq_rejected_by_message = ''

    # Snapshot taken at creation time; the predicates below always look at
    # the current sequence instead.
    @ns_present = ns_present?
    @xs_present = xs_present?
  end

  # true when the sequence contains at least one 'N' (upper case only)
  def ns_present?
    @seq_fasta.include?('N')
  end

  # true when the sequence contains at least one 'X' (upper case only)
  def xs_present?
    @seq_fasta.include?('X')
  end

  # true when the sequence has at least seq_min_length bases
  def seq_is_long_enough(seq_min_length)
    @seq_fasta.length >= seq_min_length
  end

  # Returns the sequence as a fasta entry: a ">name" line and the bases
  def to_fasta
    ">#{@seq_name}\n#{@seq_fasta}"
  end

  # Returns the qualities as a qual entry: a ">name" line and the values.
  # A list of qualities is written separated by spaces (plain interpolation
  # would produce "[40, 30]"); any other value is written as its string form.
  def to_qual
    quals = @seq_qual.is_a?(Array) ? @seq_qual.join(' ') : @seq_qual.to_s
    ">#{@seq_name}\n#{quals}"
  end

end
@@ -0,0 +1,72 @@
1
+
2
+
3
# A list of sequences that travels together with the stats, output text and
# output files collected for it.
class SequenceGroup

  attr_accessor :stats, :output_text, :output_files

  # seqs:: Array with the initial sequences. It is used as is, so push and
  #        delete also change the array handed in by the caller.
  def initialize(seqs)
    @stats = {}
    @seqs = seqs
    @output_text = {}
    @output_files = {}
  end

  # Appends one sequence to the group
  def push(seq)
    @seqs.push seq
  end

  # Removes every entry equal to seq
  def delete(seq)
    @seqs.delete(seq)
  end

  def empty?
    @seqs.empty?
  end

  # The iterators hand their block straight to the underlying array, so a
  # call without a block returns an Enumerator instead of raising
  # LocalJumpError.

  # Yields every sequence
  def each(&block)
    @seqs.each(&block)
  end

  # Yields every sequence together with its position
  def each_with_index(&block)
    @seqs.each_with_index(&block)
  end

  # Yields every sequence, last one first
  def reverse_each(&block)
    @seqs.reverse_each(&block)
  end

  # Appends all the sequences of array. The group switches to a new list, so
  # an array that was handed to the constructor is left untouched.
  def add(array)
    @seqs += array
  end

  # Number of sequences in the group
  def count
    @seqs.count
  end

  def include?(s)
    @seqs.include?(s)
  end

  # Empties the group
  def remove_all_seqs
    @seqs = []
  end

  # def job_identifier
  #   return @seqs[0].seq_name
  # end

  def inspect
    "Group with #{@seqs.count} sequences"
  end

end