seqtrimnext 2.0.29
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
@@ -0,0 +1,290 @@
|
|
1
|
+
|
2
|
+
#finds the classes that were in the folder 'classes'
|
3
|
+
|
4
|
+
ROOT_PATH=File.dirname(File.dirname(File.dirname(__FILE__)))
|
5
|
+
|
6
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'classes'))
|
7
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'classes','blast'))
|
8
|
+
|
9
|
+
#finds the classes that were in the folder 'plugins'
|
10
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'plugins'))
|
11
|
+
|
12
|
+
#finds the classes that were in the folder 'actions'
|
13
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'actions'))
|
14
|
+
|
15
|
+
#finds the classes that were in the folder 'utils'
|
16
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'utils'))
|
17
|
+
|
18
|
+
$: << File.expand_path(File.join(ROOT_PATH, 'classes','em_classes'))
|
19
|
+
|
20
|
+
$: << File.expand_path(ROOT_PATH)
|
21
|
+
|
22
|
+
$SEQTRIM_PATH = ROOT_PATH
|
23
|
+
|
24
|
+
# Resolve where the BLAST databases live.
# A BLASTDB value already present in the environment is used as the formatted
# database folder (its parent becomes $DB_PATH); otherwise both paths point
# inside the "DB" folder found under ROOT_PATH.
blastdb_from_env = ENV['BLASTDB']

if blastdb_from_env.nil?
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB", 'formatted'))
else
  $FORMATTED_DB_PATH = blastdb_from_env
  $DB_PATH = File.dirname($FORMATTED_DB_PATH)
end

# Publish the resolved folder back into the environment.
ENV['BLASTDB'] = $FORMATTED_DB_PATH
|
33
|
+
|
34
|
+
OUTPUT_PATH='output_files'
|
35
|
+
|
36
|
+
|
37
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
38
|
+
|
39
|
+
require 'scbi_mapreduce'
|
40
|
+
require 'params'
|
41
|
+
require 'action_manager'
|
42
|
+
require 'plugin_manager'
|
43
|
+
# require 'sequence_with_action'
|
44
|
+
#
|
45
|
+
require 'scbi_fastq'
|
46
|
+
require 'sequence_group'
|
47
|
+
|
48
|
+
class SeqtrimWorker < ScbiMapreduce::Worker
|
49
|
+
|
50
|
+
# Processes one unit of work: wraps +obj+ in a SequenceGroup, runs the
# plugins over it, fills in its output data and returns the group.
def process_object(obj)
  SequenceGroup.new(obj).tap do |group|
    # execute plugins
    @plugin_manager.execute_plugins(group)

    # add output data
    add_output_data(group)
  end
end
|
61
|
+
|
62
|
+
# Stores the parameters object received as initial configuration and caches
# the 'use_qual' and 'use_json' settings read from it.
def receive_initial_config(obj)
  $WORKER_LOG.info("Params received")

  @params = obj
  @use_qual = obj.get_param('use_qual')
  @use_json = obj.get_param('use_json')
end
|
72
|
+
|
73
|
+
# Start-up hook: creates the action manager and the plugin manager from the
# parameters stored in @params.
#
# Failures are reported on standard output (message, then backtrace) and are
# not re-raised, so the worker keeps its previous best-effort behaviour.
def starting_worker
  # $WORKER_LOG.level = Logger::ERROR
  $WORKER_LOG.info "Loading actions"
  @action_manager = ActionManager.new

  $WORKER_LOG.info "Loading plugins"
  @plugin_list = @params.get_param('plugin_list')

  # Interpolate instead of concatenating: `String + Array` raises TypeError,
  # which aborted start-up before the plugin manager was created.
  $WORKER_LOG.info "PLUGIN LIST:#{@plugin_list}"

  # The plugin manager stores the plugins and loads them.
  @plugin_manager = PluginManager.new(@plugin_list, @params)

# ScriptError is listed so a plugin that fails to load (LoadError) is still
# reported here; signals, SystemExit and NoMemoryError are no longer swallowed.
rescue StandardError, ScriptError => e
  puts "#{e.message}\n#{Array(e.backtrace).join("\n")}"
end
|
88
|
+
|
89
|
+
|
90
|
+
# Shutdown hook; this worker currently has nothing to do here.
def closing_worker; end
|
93
|
+
|
94
|
+
|
95
|
+
# Fills the output of a processed group: collects the text of every sequence
# in obj.output_text, writes each sequence to the output buffers/stats and
# finally drops the sequences from the group.
def add_output_data(obj)
  obj.output_text = []

  obj.each do |sequence|
    obj.output_text.push(sequence.to_text)
    write_seq_to_files(obj.output_files, sequence, obj.stats)
  end

  # the sequences are not needed anymore once the output has been written
  obj.remove_all_seqs
end
|
106
|
+
|
107
|
+
# Adds +count+ to the counter stats[key][subkey][value], creating the
# intermediate hashes (and a zero counter) when they do not exist yet.
# Returns the updated counter.
def add_stat(stats,key,subkey,value,count=1)
  by_subkey = (stats[key] ||= {})
  counters = (by_subkey[subkey] ||= {})

  counters[value] = (counters[value] || 0) + count
end
|
115
|
+
|
116
|
+
# Sends one processed sequence to the right output buffer and updates +stats+.
#
# files:: buffers keyed by output path (obtained through the *_file accessors)
# seq::   processed sequence; must answer get_file_tag_path, get_inserts,
#         get_qual_inserts, get_actions, seq_name, seq_rejected,
#         seq_rejected_by_message and to_json
# stats:: nested counters updated through add_stat
#
# Routing:
# * rejected sequence, or sequence without inserts -> rejected buffer
# * two inserts -> paired buffer (the left insert is reverse-complemented)
# * one insert  -> sequences buffer, or low-complexity buffer when the
#   sequence carries an ActionLowComplexity, plus one line in the sff-info buffer
#
# NOTE(review): a sequence with more than two inserts is written nowhere and
# is not counted - confirm this is intended.
def write_seq_to_files(files,seq, stats)
  dir_name, file_name = seq.get_file_tag_path

  # get current inserts
  inserts = seq.get_inserts

  # qualities are optional
  qual_inserts = seq.get_qual_inserts if @use_qual

  # save json if necessary
  json_file(files) << seq.to_json if @use_json

  # find mids
  mid = seq.get_actions(ActionMid).first

  if seq.seq_rejected
    write_rejected_seq(files, seq, stats, seq.seq_rejected_by_message)

  elsif inserts.empty?
    write_rejected_seq(files, seq, stats, 'No valid inserts found')

  elsif inserts.count == 2 # sequence with two inserts = PAIRED SEQUENCES
    add_stat(stats,'sequences','count','output_seqs_paired')

    # TODO - Add this stats to full stats
    # @@full_stats.add_stats({'sequences' => {'paired' => {'count' => 1}}})

    mid_id, _mid_message = mid_id_and_message(mid)

    # left read: reverse-complemented insert, qualities reversed to match
    n = "#{seq.seq_name}_left"
    c = "template=#{seq.seq_name} dir=R library=#{mid_id}"
    f = inserts[0].reverse.tr('actgACTG','tgacTGAC')
    q = @use_qual ? qual_inserts[0].reverse : []
    paired_file(files,dir_name,file_name) << FastqFile.to_fastq(n,f,q,c)

    # right read: written as is
    n = "#{seq.seq_name}_right"
    c = "template=#{seq.seq_name} dir=F library=#{mid_id}"
    f = inserts[1]
    q = @use_qual ? qual_inserts[1] : []
    paired_file(files,dir_name,file_name) << FastqFile.to_fastq(n,f,q,c)

  elsif inserts.count == 1 # sequence with one insert
    _mid_id, mid_message = mid_id_and_message(mid)

    if seq.get_actions(ActionLowComplexity).empty?
      add_stat(stats,'sequences','count','output_seqs')
      fasta_file = sequence_file(files,dir_name,file_name)
      sff_file = sffinfo_file(files,dir_name,file_name)
    else
      add_stat(stats,'sequences','count','output_seqs_low_complexity')
      fasta_file = low_complexity_file(files,dir_name,file_name)
      sff_file = low_sffinfo_file(files,dir_name,file_name)
    end

    q = @use_qual ? qual_inserts[0] : []
    n = seq.seq_name

    fasta_file << FastqFile.to_fastq(n,inserts[0],q,mid_message)

    # insert coordinates are written 1-based
    inserts_pos = seq.get_actions(ActionInsert)
    sff_file << "#{n} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
  end
end

# Returns [mid_id, mid_message] for the first MID action of a sequence.
# A missing MID, or one whose message is 'no_MID', yields the 'no_MID' id.
def mid_id_and_message(mid)
  return ['no_MID', ' No MID found'] if mid.nil? || (mid.message == 'no_MID')

  mid_id = mid.tag_id
  # an empty tag id keeps an empty message
  mid_message = (mid_id != '') ? ' ' + mid.message : ''

  [mid_id, mid_message]
end
private :mid_id_and_message

# Appends the sequence to the rejected buffer with +message+ as the reason
# and counts the rejection in +stats+.
def write_rejected_seq(files, seq, stats, message)
  rejected_output_file(files) << ('>' + seq.seq_name + ' ' + message)

  add_stat(stats,'sequences','rejected',message)
  add_stat(stats,'sequences','count','rejected')
end
private :write_rejected_seq
|
246
|
+
|
247
|
+
|
248
|
+
# ACCESS TO FILES
|
249
|
+
|
250
|
+
# Buffer for 'results.json' inside OUTPUT_PATH.
def json_file(files)
  results_path = File.join(OUTPUT_PATH, 'results.json')
  get_file(files, results_path)
end
|
253
|
+
|
254
|
+
# Buffer for 'rejected.txt' inside OUTPUT_PATH.
def rejected_output_file(files)
  rejected_path = File.join(OUTPUT_PATH, 'rejected.txt')
  get_file(files, rejected_path)
end
|
257
|
+
|
258
|
+
|
259
|
+
# Buffer for 'sequences_<file_name>.fastq' inside OUTPUT_PATH/<dir_name>.
def sequence_file(files, dir_name, file_name)
  fastq_name = 'sequences_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, fastq_name))
end
|
262
|
+
|
263
|
+
# Buffer for 'paired_<file_name>.fastq' inside OUTPUT_PATH/<dir_name>.
def paired_file(files, dir_name, file_name)
  fastq_name = 'paired_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, fastq_name))
end
|
266
|
+
|
267
|
+
# Buffer for 'low_complexity_<file_name>.fastq' inside OUTPUT_PATH/<dir_name>.
def low_complexity_file(files, dir_name, file_name)
  fastq_name = 'low_complexity_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, fastq_name))
end
|
270
|
+
|
271
|
+
# Buffer for 'sff_info_<file_name>.txt' inside OUTPUT_PATH/<dir_name>.
def sffinfo_file(files, dir_name, file_name)
  info_name = 'sff_info_' + file_name + '.txt'
  get_file(files, File.join(OUTPUT_PATH, dir_name, info_name))
end
|
274
|
+
|
275
|
+
# Buffer for 'low_complexity_sff_info_<file_name>.txt' inside OUTPUT_PATH/<dir_name>.
def low_sffinfo_file(files, dir_name, file_name)
  info_name = 'low_complexity_sff_info_' + file_name + '.txt'
  get_file(files, File.join(OUTPUT_PATH, dir_name, info_name))
end
|
278
|
+
|
279
|
+
# Returns the buffer stored in +files+ under the name +fn+, storing an empty
# array there first when nothing has been registered for that name yet.
def get_file(files,fn)
  files[fn] = [] unless files[fn]
  files[fn]
end
|
289
|
+
|
290
|
+
end
|
@@ -0,0 +1,255 @@
|
|
1
|
+
######################################
|
2
|
+
# Author:: Almudena Bocinos Rioboo
|
3
|
+
# Extract stats like mean of sequence's length
|
4
|
+
######################################
|
5
|
+
|
6
|
+
# $: << '/Users/dariogf/progs/ruby/gems/scbi_plot/lib'
|
7
|
+
# $: << '/Users/dariogf/progs/ruby/gems/scbi_math/lib'
|
8
|
+
|
9
|
+
require 'scbi_plot'
|
10
|
+
require "scbi_math"
|
11
|
+
|
12
|
+
# Gathers global statistics over every sequence supplied by a reader
# (lengths, leading keys and, optionally, per-position qualities), stores the
# derived limits in the params object, draws the graphs inside 'graphs/' and
# prints/saves a summary.
class ExtractStats

  # sequence_reader:: must answer with_qual?, num_seqs and each (yielding
  #                   name, sequence and qualities)
  # params::          must answer get_param and set_param
  def initialize(sequence_reader,params)
    @sequence_lengths = []  # length of every sequence, in reading order
    @length_frequency = []  # index = length, value = number of sequences of that length
    @keys = {}              # found keys => number of occurrences
    @params = params
    @use_qual = sequence_reader.with_qual?

    @totalnt = 0            # total number of nucleotides read
    @qv = []                # per-position quality accumulators (see add_qv)

    @sequence_lengths_stats, @length_frequency_stats, @quality_stats = extract_stats_from_sequences(sequence_reader)

    set_params_and_results

    plot_lengths

    plot_qualities if @use_qual

    print_global_stats
  end

  # Reads every sequence of the reader, accumulating lengths, keys and (when
  # qualities are in use) quality values in the instance variables.
  #
  # Returns [length_stats, length_frequency_stats, quality_stats], each one
  # built with ScbiNArray.to_na; quality_stats is nil without qualities.
  def extract_stats_from_sequences(sequence_reader)
    sequence_reader.each do |_name_seq, fasta_seq, qual|
      l = fasta_seq.length

      @totalnt += l

      # save all lengths
      @sequence_lengths.push l

      # add key value (first four bases)
      add_key(fasta_seq[0..3].upcase)

      # add fasta length; the counter starts at 0 so that the first sequence
      # of a given length is counted once and not twice
      @length_frequency[l] = (@length_frequency[l] || 0) + 1

      # extract qv values
      extract_qv_from_sequence(qual) if @use_qual

      # print some progress info
      if sequence_reader.num_seqs % 10000 == 0
        puts "Calculating stats: #{sequence_reader.num_seqs}"
      end
    end

    length_stats = ScbiNArray.to_na(@sequence_lengths)
    # lengths that never appeared are nil in the array: count them as 0
    length_frequency_stats = ScbiNArray.to_na(@length_frequency.map { |e| e || 0 })
    quality_stats = ScbiNArray.to_na(@qv) if @use_qual

    return [length_stats, length_frequency_stats, quality_stats]
  end

  # Draws 'graphs/size_stats.png': number of sequences for each length, with
  # vertical lines at the mode and at the low/high raw size limits.
  def plot_lengths
    ensure_graphs_dir

    x = (0...@length_frequency.length).to_a
    y = @length_frequency.map { |e| e || 0 }

    plot = ScbiPlot::Lines.new('graphs/size_stats.png','Stats of sequence sizes')
    plot.x_label = "Sequence length"
    plot.y_label = "Number of sequences"

    plot.add_x(x)

    plot.add_series('sizes', y,'impulses',2)

    plot.add_vertical_line('Mode',@length_frequency_stats.fat_mode[0])

    plot.add_vertical_line('L',@params.get_param('min_sequence_size_raw').to_i)
    plot.add_vertical_line('H',@params.get_param('max_sequence_size_raw').to_i)

    plot.do_graph
  end

  # Draws 'graphs/qualities.png': mean, minimum and maximum quality for each
  # nucleotide position, plus a flat line at the 'min_quality' parameter.
  def plot_qualities
    ensure_graphs_dir

    minimum_qual_value = @params.get_param('min_quality').to_i

    # get qualities values
    x = (0...@qv.length).to_a
    # NOTE(review): with integer qualities this is an integer division, so
    # the plotted mean is truncated - confirm that is acceptable.
    mean = @qv.map { |e| e[:tot] / e[:nseq] }
    min = @qv.map { |e| e[:min] }
    max = @qv.map { |e| e[:max] }
    qual_limit = Array.new(@qv.length, minimum_qual_value)

    # make plot of qualities
    plot = ScbiPlot::Lines.new('graphs/qualities.png','Stats of sequence qualities')
    plot.x_label = "Nucleotide position"
    plot.y_label = "Quality value"

    plot.add_x(x)

    plot.add_series('mean', mean)
    plot.add_series('min', min)
    plot.add_series('max', max)
    plot.add_series('qual limit',qual_limit)

    plot.do_graph
  end

  # Accumulates the quality value +q+ seen at position +i+: total, number of
  # sequences, minimum and maximum.
  def add_qv(q,i)
    @qv[i] ||= {:max => 0, :min => 1000000, :nseq => 0, :tot => 0}

    entry = @qv[i]
    entry[:tot] += q
    entry[:nseq] += 1
    entry[:min] = [entry[:min],q].min
    entry[:max] = [entry[:max],q].max
  end

  # Accumulates every quality value of one sequence by position.
  def extract_qv_from_sequence(qual)
    qual.each_with_index do |q,i|
      add_qv(q,i)
    end
  end

  # Counts one more occurrence of +key+.
  def add_key(key)
    if @keys[key].nil?
      @keys[key] = 1
    else
      @keys[key] += 1
    end
  end

  # Returns the key with the highest count (nil when no key was found).
  def get_max_key
    return @keys.keys.sort { |e1,e2| @keys[e1] <=> @keys[e2] }.last
  end

  # Stores the values derived from the statistics in the params object: the
  # most frequent key, all the found keys (as JSON) and the high size limit.
  # Does nothing except printing a message when no sequence was read.
  def set_params_and_results
    if @sequence_lengths.empty?
      puts "No sequences has been sucessfully readed "
      return
    end

    # set limiting parameters
    @params.set_param('sequencing_key',get_max_key)
    @params.set_param('all_found_keys',@keys.to_json)

    # sequence min size is taken directly from params file,
    # max sequence limit is calculated here
    if (@sequence_lengths_stats.variance_coefficient <= 10) || (@params.get_param('accept_very_long_sequences') == 'true')
      # low variation (or long sequences explicitly accepted): biggest length plus 10
      high_limit = (@sequence_lengths_stats.max + 10).to_i
    else # > 10 %
      # high variation: mean plus two standard deviations
      high_limit = (@sequence_lengths_stats.mean + 2 * @sequence_lengths_stats.stddev).to_i
    end

    @params.set_param('max_sequence_size_raw',high_limit)
  end

  # Saves the summary to 'initial_stats.json' inside OUTPUT_PATH and prints
  # it. Does nothing when the length statistics are not available.
  def print_global_stats
    return if @sequence_lengths_stats.nil?

    initial_stats = {
      :sequence_count => @sequence_lengths_stats.size,
      :smallest_sequence_size => @sequence_lengths_stats.min,
      :biggest_sequence_size => @sequence_lengths_stats.max,

      :min_sequence_size_raw => @params.get_param('min_sequence_size_raw'),
      :max_sequence_size_raw => @params.get_param('max_sequence_size_raw'),
      :coefficient_of_variance => @sequence_lengths_stats.variance_coefficient,
      :nucleotide_count => @totalnt,
      :mode_of_sizes => @length_frequency_stats.fat_mode[0],
      :mean_of_sequence_sizes => @sequence_lengths_stats.mean,

      :qv => @qv,
      :used_key => get_max_key,
      :all_keys => @keys
    }

    File.open(File.join(OUTPUT_PATH,'initial_stats.json'),'w') do |f|
      f.puts JSON.pretty_generate(initial_stats)
    end

    puts "_"*10+ " STATISTICS "+"_"*10
    puts "Total sequence count: #{@sequence_lengths_stats.size}"

    puts "Smallest sequence: #{initial_stats[:smallest_sequence_size]} nt"
    puts "Biggest sequence : #{initial_stats[:biggest_sequence_size]} nt"
    puts "Mean of sequence sizes : #{initial_stats[:mean_of_sequence_sizes]} nt"
    puts "Mode of sequence sizes : #{initial_stats[:mode_of_sizes]} nt"

    puts "Low size limit : #{initial_stats[:min_sequence_size_raw]} nt"
    puts "High size limit : #{initial_stats[:max_sequence_size_raw]} nt"

    puts "Coefficient of variation: #{initial_stats[:coefficient_of_variance]} %"
    puts "Total nucleotide count: #{initial_stats[:nucleotide_count]} nt"

    puts "_"*30
  end

  private

  # Creates the 'graphs' folder when it does not exist yet.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  def ensure_graphs_dir
    Dir.mkdir('graphs') unless File.exist?('graphs')
  end

end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'gnuplot'
|
2
|
+
|
3
|
+
# Draws a graph with gnuplot as soon as it is instantiated.
class GnuPlotGraph

  # file_name:: PNG file to write; when nil no terminal/output is configured
  # x::         values for the X axis; if every one of them is accepted by
  #             Integer() an impulses graph is drawn, otherwise a histogram
  #             using them as labels
  # y::         values for the Y axis
  # title::     graph title (defaults to +file_name+)
  def initialize(file_name,x,y,title=nil)
    # NOTE(review): this switches on Ruby warnings for the whole process -
    # it looks like a debugging leftover; confirm before removing.
    $VERBOSE=true

    Gnuplot.open do |gp|
      Gnuplot::Plot.new( gp ) do |plot|
        title = file_name unless title

        plot.title "#{title}"
        plot.xlabel "length"
        plot.ylabel "Number of sequences"
        plot.set "key off" # legend

        # stops at the first value that Integer() rejects
        numeric_axis = x.all? do |v|
          begin
            Integer(v)
            true
          rescue
            false
          end
        end

        if numeric_axis
          plot.style "fill pattern 22 border -1"
          plot.set "boxwidth 0.2" # Probably 3-5.

          plot.data << Gnuplot::DataSet.new( [x, y] ) { |ds| ds.with= " imp lw 4" }

        else # graph with strings in X axis
          plot.xlabel ""

          plot.set "style fill solid 1.00 border -1"
          plot.set "style histogram clustered gap 1 title offset character 0, 0, 0"
          plot.set "style data histogram"
          plot.set "boxwidth 0.2 absolute"
          plot.set "xtics offset 0,graph 0 rotate 90" if x.count>4

          # with more than 20 strings only the greatest ones are kept
          if x.count>20
            $VERBOSE=true
            x, y = largest_entries(x, y)
          end

          # show the graph and use labels at x
          plot.data << Gnuplot::DataSet.new( [x,y] ) { |ds| ds.using = "2:xticlabels(1)" }
        end

        unless file_name.nil?
          plot.terminal "png size 800,600"
          plot.output "#{file_name}"
        end
      end
    end
  end

  private

  # Returns [labels, values] with the (at most) 10 entries that have the
  # greatest values, greatest first. Repeated labels keep their last value.
  # NOTE(review): the original comment mentions 20 entries but only 10 are
  # kept, and every pick is printed to standard output - confirm both.
  def largest_entries(labels, values)
    remaining = {}
    labels.each_with_index { |label, i| remaining[label] = values[i] }

    top_labels = []
    top_values = []

    10.times do
      best = remaining.max_by { |_label, value| value }
      next unless best

      puts "MAX:",best.join(' * '),"of",remaining.values.sort.join(',')
      top_labels.push best[0]
      top_values.push best[1]
      remaining.delete(best[0])
    end

    [top_labels, top_values]
  end

end
|