seqtrimnext 2.0.51 → 2.0.52
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -2,25 +2,25 @@
|
|
2
2
|
#finds the classes that were in the folder 'classes'
|
3
3
|
|
4
4
|
# ROOT_PATH=File.dirname(File.dirname(File.dirname(__FILE__)))
|
5
|
-
#
|
5
|
+
#
|
6
6
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes'))
|
7
7
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','blast'))
|
8
|
-
#
|
8
|
+
#
|
9
9
|
# #finds the classes that were in the folder 'plugins'
|
10
10
|
# $: << File.expand_path(File.join(ROOT_PATH, 'plugins'))
|
11
|
-
#
|
11
|
+
#
|
12
12
|
# #finds the classes that were in the folder 'plugins'
|
13
13
|
# $: << File.expand_path(File.join(ROOT_PATH, 'actions'))
|
14
|
-
#
|
14
|
+
#
|
15
15
|
# #finds the classes that were in the folder 'utils'
|
16
16
|
# $: << File.expand_path(File.join(ROOT_PATH, 'utils'))
|
17
|
-
#
|
17
|
+
#
|
18
18
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','em_classes'))
|
19
|
-
#
|
19
|
+
#
|
20
20
|
# $: << File.expand_path(ROOT_PATH)
|
21
21
|
|
22
22
|
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
23
|
-
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
23
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
24
24
|
|
25
25
|
require 'seqtrimnext'
|
26
26
|
|
@@ -32,7 +32,7 @@ if ENV['BLASTDB']# && Dir.exists?(ENV['BLASTDB'])
|
|
32
32
|
$DB_PATH = File.dirname($FORMATTED_DB_PATH)
|
33
33
|
else
|
34
34
|
$FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB",'formatted'))
|
35
|
-
$DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
|
35
|
+
$DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
|
36
36
|
end
|
37
37
|
|
38
38
|
ENV['BLASTDB']=$FORMATTED_DB_PATH
|
@@ -47,254 +47,389 @@ require 'params'
|
|
47
47
|
require 'action_manager'
|
48
48
|
require 'plugin_manager'
|
49
49
|
# require 'sequence_with_action'
|
50
|
-
#
|
50
|
+
#
|
51
51
|
require 'scbi_fastq'
|
52
52
|
require 'sequence_group'
|
53
53
|
|
54
54
|
class SeqtrimWorker < ScbiMapreduce::Worker
|
55
55
|
|
56
|
-
|
57
|
-
running_seqs=SequenceGroup.new(obj)
|
58
|
-
|
59
|
-
# execute plugins
|
60
|
-
@plugin_manager.execute_plugins(running_seqs)
|
61
|
-
|
62
|
-
# add output data
|
63
|
-
add_output_data(running_seqs)
|
64
|
-
|
65
|
-
return running_seqs
|
66
|
-
end
|
67
|
-
|
68
|
-
def receive_initial_config(obj)
|
69
|
-
|
70
|
-
# Reads the parameters
|
71
|
-
$WORKER_LOG.info "Params received"
|
72
|
-
# @params = Params.new(params_path)
|
73
|
-
@params = obj
|
74
|
-
|
75
|
-
@use_qual=@params.get_param('use_qual')
|
76
|
-
@use_json=@params.get_param('use_json')
|
77
|
-
end
|
56
|
+
def process_object(obj)
|
78
57
|
|
79
|
-
|
80
|
-
|
81
|
-
# $WORKER_LOG.level = Logger::ERROR
|
82
|
-
$WORKER_LOG.level = Logger::WARN
|
83
|
-
$WORKER_LOG.info "Loading actions"
|
84
|
-
|
85
|
-
@action_manager = ActionManager.new
|
86
|
-
|
87
|
-
$WORKER_LOG.info "Loading plugins"
|
88
|
-
@plugin_list = @params.get_param('plugin_list') # puts in plugin_list the plugins's array
|
89
|
-
$WORKER_LOG.info "PLUGIN LIST:" + @plugin_list
|
90
|
-
|
91
|
-
@plugin_manager = PluginManager.new(@plugin_list,@params) # creates an instance from PluginManager. This must storage the plugins and load it
|
92
|
-
|
93
|
-
rescue Exception => e
|
94
|
-
puts (e.message+ e.backtrace.join("\n"))
|
95
|
-
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
def closing_worker
|
100
|
-
|
101
|
-
end
|
102
|
-
|
103
|
-
|
104
|
-
def add_output_data(obj)
|
105
|
-
obj.output_text=[]
|
106
|
-
|
107
|
-
obj.each do |seq|
|
108
|
-
obj.output_text << seq.to_text
|
109
|
-
write_seq_to_files(obj.output_files,seq, obj.stats)
|
110
|
-
end
|
111
|
-
|
112
|
-
# @remove seqs since they are not needed anymore to write output files
|
113
|
-
obj.remove_all_seqs
|
114
|
-
end
|
115
|
-
|
116
|
-
def add_stat(stats,key,subkey,value,count=1)
|
117
|
-
|
118
|
-
stats[key]={} if !stats[key]
|
119
|
-
stats[key][subkey]={} if !stats[key][subkey]
|
120
|
-
stats[key][subkey][value]=0 if !stats[key][subkey][value]
|
121
|
-
|
122
|
-
stats[key][subkey][value]+=count
|
123
|
-
end
|
58
|
+
running_seqs=SequenceGroup.new(obj.flatten)
|
124
59
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
# fasta_file = get_paired_file(mid_id)
|
183
|
-
|
184
|
-
n="#{seq.seq_name}_left"
|
185
|
-
c="template=#{seq.seq_name} dir=R library=#{mid_id}"
|
186
|
-
f=inserts[0].reverse.tr('actgACTG','tgacTGAC')
|
187
|
-
q=[]
|
188
|
-
if @use_qual
|
189
|
-
q=qual_inserts[0].reverse
|
190
|
-
end
|
191
|
-
|
192
|
-
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
193
|
-
|
194
|
-
|
195
|
-
n="#{seq.seq_name}_right"
|
196
|
-
c="template=#{seq.seq_name} dir=F library=#{mid_id}"
|
197
|
-
f=inserts[1]
|
198
|
-
q=[]
|
199
|
-
if @use_qual
|
200
|
-
q=qual_inserts[1]
|
201
|
-
end
|
202
|
-
|
203
|
-
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
204
|
-
|
205
|
-
|
206
|
-
elsif (inserts.count == 1) # sequence with one insert
|
207
|
-
|
208
|
-
if (mid.nil? || (mid.message=='no_MID') ) # without mid
|
209
|
-
mid_id = 'no_MID'
|
210
|
-
mid_message = ' No MID found'
|
211
|
-
else
|
212
|
-
mid_id = mid.tag_id
|
213
|
-
mid_message=''
|
214
|
-
if mid_id != mid_message
|
215
|
-
mid_message = ' '+mid.message
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
# save fasta and qual in no MID file
|
220
|
-
has_low_complexity = seq.get_actions(ActionLowComplexity)
|
221
|
-
|
222
|
-
if has_low_complexity.empty?
|
223
|
-
add_stat(stats,'sequences','count','output_seqs')
|
224
|
-
|
225
|
-
# fasta_file = get_sequence_file(mid_id)
|
226
|
-
# sff_file=get_sffinfo_file(mid_id)
|
227
|
-
fasta_file=sequence_file(files,dir_name,file_name)
|
228
|
-
sff_file=sffinfo_file(files,dir_name,file_name)
|
229
|
-
else
|
230
|
-
add_stat(stats,'sequences','count','output_seqs_low_complexity')
|
231
|
-
|
232
|
-
# fasta_file = get_low_complexity_file(mid_id)
|
233
|
-
# sff_file=get_low_sffinfo_file(mid_id)
|
234
|
-
fasta_file=low_complexity_file(files,dir_name,file_name)
|
235
|
-
sff_file=low_sffinfo_file(files,dir_name,file_name)
|
60
|
+
# execute plugins
|
61
|
+
@plugin_manager.execute_plugins(running_seqs)
|
62
|
+
|
63
|
+
# add output data
|
64
|
+
add_output_data(running_seqs)
|
65
|
+
|
66
|
+
return running_seqs
|
67
|
+
end
|
68
|
+
|
69
|
+
def receive_initial_config(obj)
|
70
|
+
|
71
|
+
# Reads the parameters
|
72
|
+
$WORKER_LOG.info "Params received"
|
73
|
+
# @params = Params.new(params_path)
|
74
|
+
@params = obj
|
75
|
+
@tuple_size=@params.get_param('tuple_size')
|
76
|
+
|
77
|
+
@use_qual=@params.get_param('use_qual')
|
78
|
+
@use_json=@params.get_param('use_json')
|
79
|
+
end
|
80
|
+
|
81
|
+
def starting_worker
|
82
|
+
|
83
|
+
# $WORKER_LOG.level = Logger::ERROR
|
84
|
+
$WORKER_LOG.level = Logger::WARN
|
85
|
+
$WORKER_LOG.info "Loading actions"
|
86
|
+
|
87
|
+
@action_manager = ActionManager.new
|
88
|
+
|
89
|
+
$WORKER_LOG.info "Loading plugins"
|
90
|
+
@plugin_list = @params.get_param('plugin_list') # puts in plugin_list the plugins's array
|
91
|
+
$WORKER_LOG.info "PLUGIN LIST:" + @plugin_list
|
92
|
+
|
93
|
+
@plugin_manager = PluginManager.new(@plugin_list,@params) # creates an instance from PluginManager. This must storage the plugins and load it
|
94
|
+
|
95
|
+
rescue Exception => e
|
96
|
+
puts (e.message+ e.backtrace.join("\n"))
|
97
|
+
|
98
|
+
end
|
99
|
+
|
100
|
+
|
101
|
+
def closing_worker
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
def add_output_data(obj)
|
107
|
+
obj.output_text=[]
|
108
|
+
|
109
|
+
if @tuple_size>1
|
110
|
+
obj.each_slice(@tuple_size) do |seqs|
|
111
|
+
|
112
|
+
write_seq_to_files_tuple(obj.output_files,seqs, obj.stats)
|
113
|
+
|
114
|
+
seqs.each do |seq|
|
115
|
+
obj.output_text << seq.to_text
|
236
116
|
end
|
237
|
-
|
238
|
-
q=[]
|
239
|
-
if @use_qual
|
240
|
-
q=qual_inserts[0]
|
241
|
-
end
|
242
|
-
|
243
|
-
n=seq.seq_name
|
244
|
-
c=mid_message
|
245
|
-
f=inserts[0]
|
246
|
-
|
247
|
-
fasta_file << FastqFile.to_fastq(n,f,q,c)
|
248
|
-
|
249
|
-
inserts_pos = seq.get_actions(ActionInsert)
|
250
|
-
|
251
|
-
sff_file<< "#{n} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
|
252
|
-
|
253
117
|
end
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
118
|
+
|
119
|
+
else
|
120
|
+
obj.each do |seq|
|
121
|
+
write_seq_to_files_normal(obj.output_files,seq, obj.stats)
|
122
|
+
obj.output_text << seq.to_text
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# @remove seqs since they are not needed anymore to write output files
|
127
|
+
obj.remove_all_seqs
|
128
|
+
end
|
129
|
+
|
130
|
+
def add_stat(stats,key,subkey,value,count=1)
|
131
|
+
|
132
|
+
stats[key]={} if !stats[key]
|
133
|
+
stats[key][subkey]={} if !stats[key][subkey]
|
134
|
+
stats[key][subkey][value]=0 if !stats[key][subkey][value]
|
135
|
+
|
136
|
+
stats[key][subkey][value]+=count
|
137
|
+
end
|
138
|
+
|
139
|
+
def write_seq_to_files_tuple(files,seqs, stats)
|
140
|
+
|
141
|
+
|
142
|
+
seq1=seqs[0]
|
143
|
+
seq2=seqs[1]
|
144
|
+
|
145
|
+
dir_name,file_name,priority=seq1.get_file_tag_path
|
146
|
+
dir_name2,file_name2,priority2=seq2.get_file_tag_path
|
147
|
+
|
148
|
+
# both paired sequences must go in same file, there are priorities
|
149
|
+
if (dir_name!=dir_name2) || (file_name!=file_name2)
|
150
|
+
if priority2>priority
|
151
|
+
dir_name=dir_name2
|
152
|
+
file_name=file_name2
|
153
|
+
end
|
275
154
|
end
|
155
|
+
|
156
|
+
# get current inserts
|
157
|
+
inserts1 = seq1.get_inserts
|
158
|
+
inserts2 = seq2.get_inserts
|
276
159
|
|
277
|
-
|
278
|
-
|
160
|
+
# qualities are optional
|
161
|
+
if @use_qual
|
162
|
+
qual_inserts1 = seq1.get_qual_inserts
|
163
|
+
qual_inserts2 = seq2.get_qual_inserts
|
279
164
|
end
|
280
|
-
|
281
|
-
|
282
|
-
|
165
|
+
|
166
|
+
|
167
|
+
|
168
|
+
# save json if necessary
|
169
|
+
if @use_json
|
170
|
+
json_file(files)<< seq1.to_json
|
171
|
+
json_file(files)<< seq2.to_json
|
283
172
|
end
|
284
173
|
|
285
|
-
|
286
|
-
|
174
|
+
# find mids
|
175
|
+
mid1 = seq1.get_actions(ActionMid).first
|
176
|
+
mid2 = seq2.get_actions(ActionMid).first
|
177
|
+
|
178
|
+
|
179
|
+
if !inserts1.empty? && !inserts2.empty? # both have inserts
|
180
|
+
# save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
181
|
+
save_two_inserts_tuple(files,seq1,seq2, stats,inserts1,inserts2,qual_inserts1,qual_inserts2,mid1,dir_name,file_name)
|
182
|
+
else
|
183
|
+
save_rejected_empty_or_single(files,seq1, stats,inserts1,qual_inserts1,mid1,dir_name,file_name)
|
184
|
+
save_rejected_empty_or_single(files,seq2, stats,inserts2,qual_inserts2,mid2,dir_name,file_name)
|
185
|
+
end
|
186
|
+
|
187
|
+
end
|
188
|
+
|
189
|
+
def save_two_inserts_tuple(files,seq1,seq2, stats,inserts1,inserts2,qual_inserts1,qual_inserts2,mid,dir_name,file_name)
|
190
|
+
|
191
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
192
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
193
|
+
|
194
|
+
mid_id,mid_message=get_mid_message(mid)
|
195
|
+
|
196
|
+
# save left read
|
197
|
+
n="#{seq1.seq_name}"
|
198
|
+
c=seq1.get_comment_line # "template=#{seq1.seq_name} dir=R library=#{mid_id}"
|
199
|
+
f=inserts1[0]#.reverse.tr('actgACTG','tgacTGAC')
|
200
|
+
q=[]
|
201
|
+
if @use_qual
|
202
|
+
q=qual_inserts1[0] #.reverse
|
287
203
|
end
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
204
|
+
|
205
|
+
paired_file_ilu1(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
206
|
+
|
207
|
+
# save right read
|
208
|
+
n="#{seq2.seq_name}"
|
209
|
+
c=seq2.get_comment_line # "template=#{seq2.seq_name} dir=F library=#{mid_id}"
|
210
|
+
f=inserts2[0]
|
211
|
+
q=[]
|
212
|
+
if @use_qual
|
213
|
+
q=qual_inserts2[0]
|
214
|
+
end
|
215
|
+
|
216
|
+
paired_file_ilu2(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
217
|
+
|
218
|
+
end
|
219
|
+
|
220
|
+
|
221
|
+
def save_rejected_empty_or_single(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
222
|
+
if (seq.seq_rejected) # save to rejected sequences
|
223
|
+
save_rejected_seq(files,seq, stats)
|
224
|
+
elsif (inserts.empty?) #sequence with no inserts
|
225
|
+
save_empty_insert(files,seq, stats)
|
226
|
+
elsif (inserts.count == 1) # sequence with one insert
|
227
|
+
save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
|
232
|
+
# SAVE NORMAL ===============================
|
233
|
+
def save_rejected_seq(files,seq, stats)
|
234
|
+
# message = seq.seq_rejected_by_message
|
235
|
+
message= seq.get_comment_line
|
236
|
+
rejected_output_file(files)<<('>'+seq.seq_name+ ' ' + message)
|
237
|
+
|
238
|
+
add_stat(stats,'sequences','rejected',seq.seq_rejected_by_message)
|
239
|
+
add_stat(stats,'sequences','count','rejected')
|
240
|
+
end
|
241
|
+
|
242
|
+
def save_empty_insert(files,seq, stats)
|
243
|
+
seq.seq_rejected=true
|
244
|
+
seq.seq_rejected_by_message='short insert'
|
245
|
+
|
246
|
+
message = 'No valid inserts found'
|
247
|
+
|
248
|
+
rejected_output_file(files)<<('>'+seq.seq_name+ ' ' + message)
|
249
|
+
|
250
|
+
add_stat(stats,'sequences','rejected',message)
|
251
|
+
add_stat(stats,'sequences','count','rejected')
|
252
|
+
|
253
|
+
end
|
254
|
+
|
255
|
+
def get_mid_message(mid)
|
256
|
+
if (mid.nil? || (mid.message=='no_MID') ) # without mid
|
257
|
+
mid_id = 'no_MID'
|
258
|
+
mid_message = ' No MID found'
|
259
|
+
else
|
260
|
+
mid_id = mid.tag_id
|
261
|
+
mid_message=''
|
262
|
+
if mid_id != mid_message
|
263
|
+
mid_message = ' '+mid.message
|
295
264
|
end
|
265
|
+
end
|
266
|
+
return mid_id,mid_message
|
267
|
+
end
|
268
|
+
|
269
|
+
def save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
270
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
271
|
+
|
272
|
+
mid_id,mid_message=get_mid_message(mid)
|
273
|
+
|
274
|
+
# save left read
|
275
|
+
n="#{seq.seq_name}_left"
|
276
|
+
c="template=#{seq.seq_name} dir=R library=#{mid_id} #{seq.get_comment_line}"
|
277
|
+
f=inserts[0].reverse.tr('actgACTG','tgacTGAC')
|
278
|
+
q=[]
|
279
|
+
if @use_qual
|
280
|
+
q=qual_inserts[0].reverse
|
281
|
+
end
|
282
|
+
|
283
|
+
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
284
|
+
|
285
|
+
# save right read
|
286
|
+
n="#{seq.seq_name}_right"
|
287
|
+
c="template=#{seq.seq_name} dir=F library=#{mid_id} #{seq.get_comment_line}"
|
288
|
+
f=inserts[1]
|
289
|
+
q=[]
|
290
|
+
if @use_qual
|
291
|
+
q=qual_inserts[1]
|
292
|
+
end
|
293
|
+
|
294
|
+
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
295
|
+
|
296
|
+
end
|
297
|
+
|
298
|
+
def save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
299
|
+
mid_id,mid_message=get_mid_message(mid)
|
300
|
+
|
301
|
+
# save fasta and qual in no MID file
|
302
|
+
has_low_complexity = seq.get_actions(ActionLowComplexity)
|
303
|
+
|
304
|
+
if has_low_complexity.empty?
|
305
|
+
add_stat(stats,'sequences','count','output_seqs')
|
306
|
+
|
307
|
+
fasta_file=sequence_file(files,dir_name,file_name)
|
308
|
+
sff_file=sffinfo_file(files,dir_name,file_name)
|
309
|
+
else
|
310
|
+
add_stat(stats,'sequences','count','output_seqs_low_complexity')
|
311
|
+
|
312
|
+
fasta_file=low_complexity_file(files,dir_name,file_name)
|
313
|
+
sff_file=low_sffinfo_file(files,dir_name,file_name)
|
314
|
+
end
|
315
|
+
|
316
|
+
q=[]
|
317
|
+
if @use_qual
|
318
|
+
q=qual_inserts[0]
|
319
|
+
end
|
320
|
+
|
321
|
+
n=seq.seq_name
|
322
|
+
c=mid_message
|
323
|
+
|
324
|
+
seq_comments=seq.get_comment_line
|
325
|
+
if !seq_comments.strip.empty?
|
326
|
+
c=seq_comments + c
|
327
|
+
end
|
328
|
+
|
329
|
+
f=inserts[0]
|
330
|
+
|
331
|
+
fasta_file << FastqFile.to_fastq(n,f,q,c)
|
332
|
+
|
333
|
+
inserts_pos = seq.get_actions(ActionInsert)
|
334
|
+
|
335
|
+
sff_file<< "#{n} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
|
336
|
+
|
337
|
+
|
338
|
+
end
|
339
|
+
|
340
|
+
|
341
|
+
def write_seq_to_files_normal(files,seq, stats)
|
342
|
+
|
343
|
+
# puts stats.to_json
|
344
|
+
|
345
|
+
dir_name,file_name,priority=seq.get_file_tag_path
|
346
|
+
# puts File.join(dir_name,'sequences_'+file_name)
|
347
|
+
|
348
|
+
# get current inserts
|
349
|
+
inserts = seq.get_inserts
|
350
|
+
|
351
|
+
# qualities are optional
|
352
|
+
if @use_qual
|
353
|
+
qual_inserts = seq.get_qual_inserts
|
354
|
+
end
|
355
|
+
|
356
|
+
# save json if necessary
|
357
|
+
if @use_json
|
358
|
+
json_file(files)<< seq.to_json
|
359
|
+
end
|
360
|
+
|
361
|
+
# find mids
|
362
|
+
mid = seq.get_actions(ActionMid).first
|
363
|
+
|
364
|
+
|
365
|
+
if (seq.seq_rejected) # save to rejected sequences
|
366
|
+
save_rejected_seq(files,seq, stats)
|
296
367
|
|
297
|
-
|
368
|
+
elsif (inserts.empty?) #sequence with no inserts
|
369
|
+
save_empty_insert(files,seq, stats)
|
370
|
+
|
371
|
+
elsif (inserts.count == 2) # sequence with two inserts = PAIRED SEQUENCES
|
372
|
+
save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
373
|
+
|
374
|
+
elsif (inserts.count == 1) # sequence with one insert
|
375
|
+
save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
376
|
+
end
|
377
|
+
|
378
|
+
end
|
379
|
+
|
380
|
+
|
381
|
+
|
382
|
+
|
383
|
+
|
384
|
+
# ACCESS TO FILES
|
385
|
+
|
386
|
+
def json_file(files)
|
387
|
+
return get_file(files,File.join(OUTPUT_PATH,'results.json'))
|
388
|
+
end
|
389
|
+
|
390
|
+
def rejected_output_file(files)
|
391
|
+
return get_file(files,File.join(OUTPUT_PATH,'rejected.txt'))
|
392
|
+
end
|
393
|
+
|
394
|
+
|
395
|
+
def sequence_file(files, dir_name, file_name)
|
396
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'sequences_'+file_name+'.fastq'))
|
397
|
+
end
|
398
|
+
|
399
|
+
def paired_file(files, dir_name, file_name)
|
400
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_'+file_name+'.fastq'))
|
401
|
+
end
|
402
|
+
|
403
|
+
def paired_file_ilu1(files, dir_name, file_name)
|
404
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_1_'+file_name+'.fastq'))
|
405
|
+
end
|
406
|
+
|
407
|
+
def paired_file_ilu2(files, dir_name, file_name)
|
408
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_2_'+file_name+'.fastq'))
|
409
|
+
end
|
410
|
+
|
411
|
+
|
412
|
+
def low_complexity_file(files, dir_name, file_name)
|
413
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'low_complexity_'+file_name+'.fastq'))
|
414
|
+
end
|
415
|
+
|
416
|
+
def sffinfo_file(files, dir_name, file_name)
|
417
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'sff_info_'+file_name+'.txt'))
|
418
|
+
end
|
419
|
+
|
420
|
+
def low_sffinfo_file(files, dir_name, file_name)
|
421
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'low_complexity_sff_info_'+file_name+'.txt'))
|
422
|
+
end
|
423
|
+
|
424
|
+
def get_file(files,fn)
|
425
|
+
res=files[fn]
|
426
|
+
|
427
|
+
if !res
|
428
|
+
files[fn]=[]
|
429
|
+
res=files[fn]
|
298
430
|
end
|
299
|
-
|
431
|
+
|
432
|
+
return res
|
433
|
+
end
|
434
|
+
|
300
435
|
end
|