seqtrimnext 2.0.51 → 2.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -2,25 +2,25 @@
|
|
2
2
|
#finds the classes that were in the folder 'classes'
|
3
3
|
|
4
4
|
# ROOT_PATH=File.dirname(File.dirname(File.dirname(__FILE__)))
|
5
|
-
#
|
5
|
+
#
|
6
6
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes'))
|
7
7
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','blast'))
|
8
|
-
#
|
8
|
+
#
|
9
9
|
# #finds the classes that were in the folder 'plugins'
|
10
10
|
# $: << File.expand_path(File.join(ROOT_PATH, 'plugins'))
|
11
|
-
#
|
11
|
+
#
|
12
12
|
# #finds the classes that were in the folder 'plugins'
|
13
13
|
# $: << File.expand_path(File.join(ROOT_PATH, 'actions'))
|
14
|
-
#
|
14
|
+
#
|
15
15
|
# #finds the classes that were in the folder 'utils'
|
16
16
|
# $: << File.expand_path(File.join(ROOT_PATH, 'utils'))
|
17
|
-
#
|
17
|
+
#
|
18
18
|
# $: << File.expand_path(File.join(ROOT_PATH, 'classes','em_classes'))
|
19
|
-
#
|
19
|
+
#
|
20
20
|
# $: << File.expand_path(ROOT_PATH)
|
21
21
|
|
22
22
|
$: << File.expand_path('~/progs/ruby/gems/seqtrimnext/lib/')
|
23
|
-
$: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
23
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
24
24
|
|
25
25
|
require 'seqtrimnext'
|
26
26
|
|
@@ -32,7 +32,7 @@ if ENV['BLASTDB']# && Dir.exists?(ENV['BLASTDB'])
|
|
32
32
|
$DB_PATH = File.dirname($FORMATTED_DB_PATH)
|
33
33
|
else
|
34
34
|
$FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB",'formatted'))
|
35
|
-
$DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
|
35
|
+
$DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
|
36
36
|
end
|
37
37
|
|
38
38
|
ENV['BLASTDB']=$FORMATTED_DB_PATH
|
@@ -47,254 +47,389 @@ require 'params'
|
|
47
47
|
require 'action_manager'
|
48
48
|
require 'plugin_manager'
|
49
49
|
# require 'sequence_with_action'
|
50
|
-
#
|
50
|
+
#
|
51
51
|
require 'scbi_fastq'
|
52
52
|
require 'sequence_group'
|
53
53
|
|
54
54
|
class SeqtrimWorker < ScbiMapreduce::Worker
|
55
55
|
|
56
|
-
|
57
|
-
running_seqs=SequenceGroup.new(obj)
|
58
|
-
|
59
|
-
# execute plugins
|
60
|
-
@plugin_manager.execute_plugins(running_seqs)
|
61
|
-
|
62
|
-
# add output data
|
63
|
-
add_output_data(running_seqs)
|
64
|
-
|
65
|
-
return running_seqs
|
66
|
-
end
|
67
|
-
|
68
|
-
def receive_initial_config(obj)
|
69
|
-
|
70
|
-
# Reads the parameters
|
71
|
-
$WORKER_LOG.info "Params received"
|
72
|
-
# @params = Params.new(params_path)
|
73
|
-
@params = obj
|
74
|
-
|
75
|
-
@use_qual=@params.get_param('use_qual')
|
76
|
-
@use_json=@params.get_param('use_json')
|
77
|
-
end
|
56
|
+
def process_object(obj)
|
78
57
|
|
79
|
-
|
80
|
-
|
81
|
-
# $WORKER_LOG.level = Logger::ERROR
|
82
|
-
$WORKER_LOG.level = Logger::WARN
|
83
|
-
$WORKER_LOG.info "Loading actions"
|
84
|
-
|
85
|
-
@action_manager = ActionManager.new
|
86
|
-
|
87
|
-
$WORKER_LOG.info "Loading plugins"
|
88
|
-
@plugin_list = @params.get_param('plugin_list') # puts in plugin_list the plugins's array
|
89
|
-
$WORKER_LOG.info "PLUGIN LIST:" + @plugin_list
|
90
|
-
|
91
|
-
@plugin_manager = PluginManager.new(@plugin_list,@params) # creates an instance from PluginManager. This must storage the plugins and load it
|
92
|
-
|
93
|
-
rescue Exception => e
|
94
|
-
puts (e.message+ e.backtrace.join("\n"))
|
95
|
-
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
def closing_worker
|
100
|
-
|
101
|
-
end
|
102
|
-
|
103
|
-
|
104
|
-
def add_output_data(obj)
|
105
|
-
obj.output_text=[]
|
106
|
-
|
107
|
-
obj.each do |seq|
|
108
|
-
obj.output_text << seq.to_text
|
109
|
-
write_seq_to_files(obj.output_files,seq, obj.stats)
|
110
|
-
end
|
111
|
-
|
112
|
-
# @remove seqs since they are not needed anymore to write output files
|
113
|
-
obj.remove_all_seqs
|
114
|
-
end
|
115
|
-
|
116
|
-
def add_stat(stats,key,subkey,value,count=1)
|
117
|
-
|
118
|
-
stats[key]={} if !stats[key]
|
119
|
-
stats[key][subkey]={} if !stats[key][subkey]
|
120
|
-
stats[key][subkey][value]=0 if !stats[key][subkey][value]
|
121
|
-
|
122
|
-
stats[key][subkey][value]+=count
|
123
|
-
end
|
58
|
+
running_seqs=SequenceGroup.new(obj.flatten)
|
124
59
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
# fasta_file = get_paired_file(mid_id)
|
183
|
-
|
184
|
-
n="#{seq.seq_name}_left"
|
185
|
-
c="template=#{seq.seq_name} dir=R library=#{mid_id}"
|
186
|
-
f=inserts[0].reverse.tr('actgACTG','tgacTGAC')
|
187
|
-
q=[]
|
188
|
-
if @use_qual
|
189
|
-
q=qual_inserts[0].reverse
|
190
|
-
end
|
191
|
-
|
192
|
-
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
193
|
-
|
194
|
-
|
195
|
-
n="#{seq.seq_name}_right"
|
196
|
-
c="template=#{seq.seq_name} dir=F library=#{mid_id}"
|
197
|
-
f=inserts[1]
|
198
|
-
q=[]
|
199
|
-
if @use_qual
|
200
|
-
q=qual_inserts[1]
|
201
|
-
end
|
202
|
-
|
203
|
-
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
204
|
-
|
205
|
-
|
206
|
-
elsif (inserts.count == 1) # sequence with one insert
|
207
|
-
|
208
|
-
if (mid.nil? || (mid.message=='no_MID') ) # without mid
|
209
|
-
mid_id = 'no_MID'
|
210
|
-
mid_message = ' No MID found'
|
211
|
-
else
|
212
|
-
mid_id = mid.tag_id
|
213
|
-
mid_message=''
|
214
|
-
if mid_id != mid_message
|
215
|
-
mid_message = ' '+mid.message
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
# save fasta and qual in no MID file
|
220
|
-
has_low_complexity = seq.get_actions(ActionLowComplexity)
|
221
|
-
|
222
|
-
if has_low_complexity.empty?
|
223
|
-
add_stat(stats,'sequences','count','output_seqs')
|
224
|
-
|
225
|
-
# fasta_file = get_sequence_file(mid_id)
|
226
|
-
# sff_file=get_sffinfo_file(mid_id)
|
227
|
-
fasta_file=sequence_file(files,dir_name,file_name)
|
228
|
-
sff_file=sffinfo_file(files,dir_name,file_name)
|
229
|
-
else
|
230
|
-
add_stat(stats,'sequences','count','output_seqs_low_complexity')
|
231
|
-
|
232
|
-
# fasta_file = get_low_complexity_file(mid_id)
|
233
|
-
# sff_file=get_low_sffinfo_file(mid_id)
|
234
|
-
fasta_file=low_complexity_file(files,dir_name,file_name)
|
235
|
-
sff_file=low_sffinfo_file(files,dir_name,file_name)
|
60
|
+
# execute plugins
|
61
|
+
@plugin_manager.execute_plugins(running_seqs)
|
62
|
+
|
63
|
+
# add output data
|
64
|
+
add_output_data(running_seqs)
|
65
|
+
|
66
|
+
return running_seqs
|
67
|
+
end
|
68
|
+
|
69
|
+
def receive_initial_config(obj)
|
70
|
+
|
71
|
+
# Reads the parameters
|
72
|
+
$WORKER_LOG.info "Params received"
|
73
|
+
# @params = Params.new(params_path)
|
74
|
+
@params = obj
|
75
|
+
@tuple_size=@params.get_param('tuple_size')
|
76
|
+
|
77
|
+
@use_qual=@params.get_param('use_qual')
|
78
|
+
@use_json=@params.get_param('use_json')
|
79
|
+
end
|
80
|
+
|
81
|
+
def starting_worker
|
82
|
+
|
83
|
+
# $WORKER_LOG.level = Logger::ERROR
|
84
|
+
$WORKER_LOG.level = Logger::WARN
|
85
|
+
$WORKER_LOG.info "Loading actions"
|
86
|
+
|
87
|
+
@action_manager = ActionManager.new
|
88
|
+
|
89
|
+
$WORKER_LOG.info "Loading plugins"
|
90
|
+
@plugin_list = @params.get_param('plugin_list') # puts in plugin_list the plugins's array
|
91
|
+
$WORKER_LOG.info "PLUGIN LIST:" + @plugin_list
|
92
|
+
|
93
|
+
@plugin_manager = PluginManager.new(@plugin_list,@params) # creates an instance from PluginManager. This must storage the plugins and load it
|
94
|
+
|
95
|
+
rescue Exception => e
|
96
|
+
puts (e.message+ e.backtrace.join("\n"))
|
97
|
+
|
98
|
+
end
|
99
|
+
|
100
|
+
|
101
|
+
def closing_worker
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
def add_output_data(obj)
|
107
|
+
obj.output_text=[]
|
108
|
+
|
109
|
+
if @tuple_size>1
|
110
|
+
obj.each_slice(@tuple_size) do |seqs|
|
111
|
+
|
112
|
+
write_seq_to_files_tuple(obj.output_files,seqs, obj.stats)
|
113
|
+
|
114
|
+
seqs.each do |seq|
|
115
|
+
obj.output_text << seq.to_text
|
236
116
|
end
|
237
|
-
|
238
|
-
q=[]
|
239
|
-
if @use_qual
|
240
|
-
q=qual_inserts[0]
|
241
|
-
end
|
242
|
-
|
243
|
-
n=seq.seq_name
|
244
|
-
c=mid_message
|
245
|
-
f=inserts[0]
|
246
|
-
|
247
|
-
fasta_file << FastqFile.to_fastq(n,f,q,c)
|
248
|
-
|
249
|
-
inserts_pos = seq.get_actions(ActionInsert)
|
250
|
-
|
251
|
-
sff_file<< "#{n} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
|
252
|
-
|
253
117
|
end
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
118
|
+
|
119
|
+
else
|
120
|
+
obj.each do |seq|
|
121
|
+
write_seq_to_files_normal(obj.output_files,seq, obj.stats)
|
122
|
+
obj.output_text << seq.to_text
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# @remove seqs since they are not needed anymore to write output files
|
127
|
+
obj.remove_all_seqs
|
128
|
+
end
|
129
|
+
|
130
|
+
def add_stat(stats,key,subkey,value,count=1)
|
131
|
+
|
132
|
+
stats[key]={} if !stats[key]
|
133
|
+
stats[key][subkey]={} if !stats[key][subkey]
|
134
|
+
stats[key][subkey][value]=0 if !stats[key][subkey][value]
|
135
|
+
|
136
|
+
stats[key][subkey][value]+=count
|
137
|
+
end
|
138
|
+
|
139
|
+
def write_seq_to_files_tuple(files,seqs, stats)
|
140
|
+
|
141
|
+
|
142
|
+
seq1=seqs[0]
|
143
|
+
seq2=seqs[1]
|
144
|
+
|
145
|
+
dir_name,file_name,priority=seq1.get_file_tag_path
|
146
|
+
dir_name2,file_name2,priority2=seq2.get_file_tag_path
|
147
|
+
|
148
|
+
# both paired sequences must go in same file, there are priorities
|
149
|
+
if (dir_name!=dir_name2) || (file_name!=file_name2)
|
150
|
+
if priority2>priority
|
151
|
+
dir_name=dir_name2
|
152
|
+
file_name=file_name2
|
153
|
+
end
|
275
154
|
end
|
155
|
+
|
156
|
+
# get current inserts
|
157
|
+
inserts1 = seq1.get_inserts
|
158
|
+
inserts2 = seq2.get_inserts
|
276
159
|
|
277
|
-
|
278
|
-
|
160
|
+
# qualities are optional
|
161
|
+
if @use_qual
|
162
|
+
qual_inserts1 = seq1.get_qual_inserts
|
163
|
+
qual_inserts2 = seq2.get_qual_inserts
|
279
164
|
end
|
280
|
-
|
281
|
-
|
282
|
-
|
165
|
+
|
166
|
+
|
167
|
+
|
168
|
+
# save json if necessary
|
169
|
+
if @use_json
|
170
|
+
json_file(files)<< seq1.to_json
|
171
|
+
json_file(files)<< seq2.to_json
|
283
172
|
end
|
284
173
|
|
285
|
-
|
286
|
-
|
174
|
+
# find mids
|
175
|
+
mid1 = seq1.get_actions(ActionMid).first
|
176
|
+
mid2 = seq2.get_actions(ActionMid).first
|
177
|
+
|
178
|
+
|
179
|
+
if !inserts1.empty? && !inserts2.empty? # both have inserts
|
180
|
+
# save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
181
|
+
save_two_inserts_tuple(files,seq1,seq2, stats,inserts1,inserts2,qual_inserts1,qual_inserts2,mid1,dir_name,file_name)
|
182
|
+
else
|
183
|
+
save_rejected_empty_or_single(files,seq1, stats,inserts1,qual_inserts1,mid1,dir_name,file_name)
|
184
|
+
save_rejected_empty_or_single(files,seq2, stats,inserts2,qual_inserts2,mid2,dir_name,file_name)
|
185
|
+
end
|
186
|
+
|
187
|
+
end
|
188
|
+
|
189
|
+
def save_two_inserts_tuple(files,seq1,seq2, stats,inserts1,inserts2,qual_inserts1,qual_inserts2,mid,dir_name,file_name)
|
190
|
+
|
191
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
192
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
193
|
+
|
194
|
+
mid_id,mid_message=get_mid_message(mid)
|
195
|
+
|
196
|
+
# save left read
|
197
|
+
n="#{seq1.seq_name}"
|
198
|
+
c=seq1.get_comment_line # "template=#{seq1.seq_name} dir=R library=#{mid_id}"
|
199
|
+
f=inserts1[0]#.reverse.tr('actgACTG','tgacTGAC')
|
200
|
+
q=[]
|
201
|
+
if @use_qual
|
202
|
+
q=qual_inserts1[0] #.reverse
|
287
203
|
end
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
204
|
+
|
205
|
+
paired_file_ilu1(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
206
|
+
|
207
|
+
# save right read
|
208
|
+
n="#{seq2.seq_name}"
|
209
|
+
c=seq2.get_comment_line # "template=#{seq2.seq_name} dir=F library=#{mid_id}"
|
210
|
+
f=inserts2[0]
|
211
|
+
q=[]
|
212
|
+
if @use_qual
|
213
|
+
q=qual_inserts2[0]
|
214
|
+
end
|
215
|
+
|
216
|
+
paired_file_ilu2(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
217
|
+
|
218
|
+
end
|
219
|
+
|
220
|
+
|
221
|
+
def save_rejected_empty_or_single(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
222
|
+
if (seq.seq_rejected) # save to rejected sequences
|
223
|
+
save_rejected_seq(files,seq, stats)
|
224
|
+
elsif (inserts.empty?) #sequence with no inserts
|
225
|
+
save_empty_insert(files,seq, stats)
|
226
|
+
elsif (inserts.count == 1) # sequence with one insert
|
227
|
+
save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
|
232
|
+
# SAVE NORMAL ===============================
|
233
|
+
def save_rejected_seq(files,seq, stats)
|
234
|
+
# message = seq.seq_rejected_by_message
|
235
|
+
message= seq.get_comment_line
|
236
|
+
rejected_output_file(files)<<('>'+seq.seq_name+ ' ' + message)
|
237
|
+
|
238
|
+
add_stat(stats,'sequences','rejected',seq.seq_rejected_by_message)
|
239
|
+
add_stat(stats,'sequences','count','rejected')
|
240
|
+
end
|
241
|
+
|
242
|
+
def save_empty_insert(files,seq, stats)
|
243
|
+
seq.seq_rejected=true
|
244
|
+
seq.seq_rejected_by_message='short insert'
|
245
|
+
|
246
|
+
message = 'No valid inserts found'
|
247
|
+
|
248
|
+
rejected_output_file(files)<<('>'+seq.seq_name+ ' ' + message)
|
249
|
+
|
250
|
+
add_stat(stats,'sequences','rejected',message)
|
251
|
+
add_stat(stats,'sequences','count','rejected')
|
252
|
+
|
253
|
+
end
|
254
|
+
|
255
|
+
def get_mid_message(mid)
|
256
|
+
if (mid.nil? || (mid.message=='no_MID') ) # without mid
|
257
|
+
mid_id = 'no_MID'
|
258
|
+
mid_message = ' No MID found'
|
259
|
+
else
|
260
|
+
mid_id = mid.tag_id
|
261
|
+
mid_message=''
|
262
|
+
if mid_id != mid_message
|
263
|
+
mid_message = ' '+mid.message
|
295
264
|
end
|
265
|
+
end
|
266
|
+
return mid_id,mid_message
|
267
|
+
end
|
268
|
+
|
269
|
+
def save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
270
|
+
add_stat(stats,'sequences','count','output_seqs_paired')
|
271
|
+
|
272
|
+
mid_id,mid_message=get_mid_message(mid)
|
273
|
+
|
274
|
+
# save left read
|
275
|
+
n="#{seq.seq_name}_left"
|
276
|
+
c="template=#{seq.seq_name} dir=R library=#{mid_id} #{seq.get_comment_line}"
|
277
|
+
f=inserts[0].reverse.tr('actgACTG','tgacTGAC')
|
278
|
+
q=[]
|
279
|
+
if @use_qual
|
280
|
+
q=qual_inserts[0].reverse
|
281
|
+
end
|
282
|
+
|
283
|
+
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
284
|
+
|
285
|
+
# save right read
|
286
|
+
n="#{seq.seq_name}_right"
|
287
|
+
c="template=#{seq.seq_name} dir=F library=#{mid_id} #{seq.get_comment_line}"
|
288
|
+
f=inserts[1]
|
289
|
+
q=[]
|
290
|
+
if @use_qual
|
291
|
+
q=qual_inserts[1]
|
292
|
+
end
|
293
|
+
|
294
|
+
paired_file(files,dir_name,file_name)<<FastqFile.to_fastq(n,f,q,c)
|
295
|
+
|
296
|
+
end
|
297
|
+
|
298
|
+
def save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
299
|
+
mid_id,mid_message=get_mid_message(mid)
|
300
|
+
|
301
|
+
# save fasta and qual in no MID file
|
302
|
+
has_low_complexity = seq.get_actions(ActionLowComplexity)
|
303
|
+
|
304
|
+
if has_low_complexity.empty?
|
305
|
+
add_stat(stats,'sequences','count','output_seqs')
|
306
|
+
|
307
|
+
fasta_file=sequence_file(files,dir_name,file_name)
|
308
|
+
sff_file=sffinfo_file(files,dir_name,file_name)
|
309
|
+
else
|
310
|
+
add_stat(stats,'sequences','count','output_seqs_low_complexity')
|
311
|
+
|
312
|
+
fasta_file=low_complexity_file(files,dir_name,file_name)
|
313
|
+
sff_file=low_sffinfo_file(files,dir_name,file_name)
|
314
|
+
end
|
315
|
+
|
316
|
+
q=[]
|
317
|
+
if @use_qual
|
318
|
+
q=qual_inserts[0]
|
319
|
+
end
|
320
|
+
|
321
|
+
n=seq.seq_name
|
322
|
+
c=mid_message
|
323
|
+
|
324
|
+
seq_comments=seq.get_comment_line
|
325
|
+
if !seq_comments.strip.empty?
|
326
|
+
c=seq_comments + c
|
327
|
+
end
|
328
|
+
|
329
|
+
f=inserts[0]
|
330
|
+
|
331
|
+
fasta_file << FastqFile.to_fastq(n,f,q,c)
|
332
|
+
|
333
|
+
inserts_pos = seq.get_actions(ActionInsert)
|
334
|
+
|
335
|
+
sff_file<< "#{n} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
|
336
|
+
|
337
|
+
|
338
|
+
end
|
339
|
+
|
340
|
+
|
341
|
+
def write_seq_to_files_normal(files,seq, stats)
|
342
|
+
|
343
|
+
# puts stats.to_json
|
344
|
+
|
345
|
+
dir_name,file_name,priority=seq.get_file_tag_path
|
346
|
+
# puts File.join(dir_name,'sequences_'+file_name)
|
347
|
+
|
348
|
+
# get current inserts
|
349
|
+
inserts = seq.get_inserts
|
350
|
+
|
351
|
+
# qualities are optional
|
352
|
+
if @use_qual
|
353
|
+
qual_inserts = seq.get_qual_inserts
|
354
|
+
end
|
355
|
+
|
356
|
+
# save json if necessary
|
357
|
+
if @use_json
|
358
|
+
json_file(files)<< seq.to_json
|
359
|
+
end
|
360
|
+
|
361
|
+
# find mids
|
362
|
+
mid = seq.get_actions(ActionMid).first
|
363
|
+
|
364
|
+
|
365
|
+
if (seq.seq_rejected) # save to rejected sequences
|
366
|
+
save_rejected_seq(files,seq, stats)
|
296
367
|
|
297
|
-
|
368
|
+
elsif (inserts.empty?) #sequence with no inserts
|
369
|
+
save_empty_insert(files,seq, stats)
|
370
|
+
|
371
|
+
elsif (inserts.count == 2) # sequence with two inserts = PAIRED SEQUENCES
|
372
|
+
save_two_inserts(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
373
|
+
|
374
|
+
elsif (inserts.count == 1) # sequence with one insert
|
375
|
+
save_one_insert(files,seq, stats,inserts,qual_inserts,mid,dir_name,file_name)
|
376
|
+
end
|
377
|
+
|
378
|
+
end
|
379
|
+
|
380
|
+
|
381
|
+
|
382
|
+
|
383
|
+
|
384
|
+
# ACCESS TO FILES
|
385
|
+
|
386
|
+
def json_file(files)
|
387
|
+
return get_file(files,File.join(OUTPUT_PATH,'results.json'))
|
388
|
+
end
|
389
|
+
|
390
|
+
def rejected_output_file(files)
|
391
|
+
return get_file(files,File.join(OUTPUT_PATH,'rejected.txt'))
|
392
|
+
end
|
393
|
+
|
394
|
+
|
395
|
+
def sequence_file(files, dir_name, file_name)
|
396
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'sequences_'+file_name+'.fastq'))
|
397
|
+
end
|
398
|
+
|
399
|
+
def paired_file(files, dir_name, file_name)
|
400
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_'+file_name+'.fastq'))
|
401
|
+
end
|
402
|
+
|
403
|
+
def paired_file_ilu1(files, dir_name, file_name)
|
404
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_1_'+file_name+'.fastq'))
|
405
|
+
end
|
406
|
+
|
407
|
+
def paired_file_ilu2(files, dir_name, file_name)
|
408
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'paired_2_'+file_name+'.fastq'))
|
409
|
+
end
|
410
|
+
|
411
|
+
|
412
|
+
def low_complexity_file(files, dir_name, file_name)
|
413
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'low_complexity_'+file_name+'.fastq'))
|
414
|
+
end
|
415
|
+
|
416
|
+
def sffinfo_file(files, dir_name, file_name)
|
417
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'sff_info_'+file_name+'.txt'))
|
418
|
+
end
|
419
|
+
|
420
|
+
def low_sffinfo_file(files, dir_name, file_name)
|
421
|
+
return get_file(files,File.join(OUTPUT_PATH,dir_name,'low_complexity_sff_info_'+file_name+'.txt'))
|
422
|
+
end
|
423
|
+
|
424
|
+
def get_file(files,fn)
|
425
|
+
res=files[fn]
|
426
|
+
|
427
|
+
if !res
|
428
|
+
files[fn]=[]
|
429
|
+
res=files[fn]
|
298
430
|
end
|
299
|
-
|
431
|
+
|
432
|
+
return res
|
433
|
+
end
|
434
|
+
|
300
435
|
end
|