seqtrimnext 2.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +114 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +159 -0
- data/Rakefile +38 -0
- data/bin/create_graphs.rb +46 -0
- data/bin/extract_seqs.rb +45 -0
- data/bin/extract_seqs_from_fasta.rb +56 -0
- data/bin/extract_seqs_from_fastq.rb +45 -0
- data/bin/fasta2fastq.rb +38 -0
- data/bin/fastq2fasta.rb +35 -0
- data/bin/gen_qual.rb +46 -0
- data/bin/get_seq.rb +46 -0
- data/bin/group_by_range.rb +17 -0
- data/bin/join_ilumina_paired.rb +130 -0
- data/bin/parse_amplicons.rb +95 -0
- data/bin/parse_json_results.rb +66 -0
- data/bin/parse_params.rb +82 -0
- data/bin/resume_clusters.rb +48 -0
- data/bin/resume_rejected.sh +9 -0
- data/bin/reverse_paired.rb +49 -0
- data/bin/seqtrimnext +368 -0
- data/bin/split_fastq.rb +42 -0
- data/bin/split_ilumina_paired.rb +65 -0
- data/bin/split_paired.rb +70 -0
- data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
- data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
- data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
- data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
- data/lib/seqtrimnext/actions/action_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
- data/lib/seqtrimnext/actions/action_key.rb +30 -0
- data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
- data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
- data/lib/seqtrimnext/actions/action_linker.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
- data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
- data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
- data/lib/seqtrimnext/actions/action_mid.rb +30 -0
- data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
- data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
- data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
- data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
- data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
- data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
- data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
- data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
- data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
- data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
- data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
- data/lib/seqtrimnext/classes/action_manager.rb +47 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
- data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
- data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
- data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
- data/lib/seqtrimnext/classes/install_database.rb +43 -0
- data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
- data/lib/seqtrimnext/classes/list_db.rb +49 -0
- data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
- data/lib/seqtrimnext/classes/one_blast.rb +41 -0
- data/lib/seqtrimnext/classes/params.rb +387 -0
- data/lib/seqtrimnext/classes/piro.rb +78 -0
- data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
- data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
- data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
- data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
- data/lib/seqtrimnext/classes/sequence.rb +55 -0
- data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
- data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
- data/lib/seqtrimnext/plugins/plugin.rb +267 -0
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
- data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
- data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
- data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
- data/lib/seqtrimnext/templates/amplicons.txt +16 -0
- data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality.txt +5 -0
- data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
- data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
- data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
- data/lib/seqtrimnext/utils/global_match.rb +65 -0
- data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
- data/lib/seqtrimnext/utils/json_utils.rb +50 -0
- data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
- data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
- data/lib/seqtrimnext/utils/string_utils.rb +56 -0
- data/lib/seqtrimnext.rb +37 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_seqtrimnext.rb +11 -0
- metadata +318 -0
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
|
|
2
|
+
# Root of the library tree: three directory levels above this file.
ROOT_PATH = File.dirname(File.dirname(File.dirname(__FILE__)))

# Directories (relative to ROOT_PATH) appended to the load path, in order:
# classes (plus its 'blast' subfolder), plugins, actions, utils, the
# em_classes subfolder and finally ROOT_PATH itself (the empty entry).
[
  %w[classes],
  %w[classes blast],
  %w[plugins],
  %w[actions],
  %w[utils],
  %w[classes em_classes],
  []
].each do |parts|
  $LOAD_PATH << File.expand_path(File.join(ROOT_PATH, *parts))
end

$SEQTRIM_PATH = ROOT_PATH

# Database locations: an existing BLASTDB environment variable wins,
# otherwise fall back to the DB folder below ROOT_PATH.
blast_db_from_env = ENV['BLASTDB']

if blast_db_from_env
  $FORMATTED_DB_PATH = blast_db_from_env
  $DB_PATH = File.dirname(blast_db_from_env)
else
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB", 'formatted'))
end

# Publish the chosen formatted-database path back into the environment.
ENV['BLASTDB'] = $FORMATTED_DB_PATH

# Folder (relative path) under which the output file names are built.
OUTPUT_PATH = 'output_files'
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
|
|
38
|
+
|
|
39
|
+
require 'scbi_mapreduce'
|
|
40
|
+
require 'params'
|
|
41
|
+
require 'action_manager'
|
|
42
|
+
require 'plugin_manager'
|
|
43
|
+
# require 'sequence_with_action'
|
|
44
|
+
#
|
|
45
|
+
require 'scbi_fastq'
|
|
46
|
+
require 'sequence_group'
|
|
47
|
+
|
|
48
|
+
# Worker of the scbi_mapreduce job. It receives the run parameters, loads the
# actions and plugins, executes the plugins on every batch of sequences and
# converts the results into per-file output lines and statistics.
class SeqtrimWorker < ScbiMapreduce::Worker

  # Processes one work unit: wraps +obj+ in a SequenceGroup, runs all plugins
  # on it, attaches the output data and returns the group.
  def process_object(obj)
    running_seqs = SequenceGroup.new(obj)

    # execute plugins
    @plugin_manager.execute_plugins(running_seqs)

    # add output data
    add_output_data(running_seqs)

    running_seqs
  end

  # Stores the parameters object sent to the worker and caches the
  # 'use_qual' and 'use_json' settings read from it.
  def receive_initial_config(obj)
    $WORKER_LOG.info "Params received"
    @params = obj

    @use_qual = @params.get_param('use_qual')
    @use_json = @params.get_param('use_json')
  end

  # Creates the action manager and the plugin manager for this worker.
  #
  # Errors raised while loading (including LoadError/SyntaxError, which are
  # ScriptErrors) are printed and not re-raised, as before. Signals,
  # SystemExit and NoMemoryError are deliberately no longer trapped.
  def starting_worker
    $WORKER_LOG.info "Loading actions"
    @action_manager = ActionManager.new

    $WORKER_LOG.info "Loading plugins"
    @plugin_list = @params.get_param('plugin_list')
    # Interpolation instead of String#+ so a non-String list cannot raise.
    $WORKER_LOG.info "PLUGIN LIST:#{@plugin_list}"

    # The plugin manager receives the plugin list and the parameters.
    @plugin_manager = PluginManager.new(@plugin_list, @params)
  rescue StandardError, ScriptError => e
    # Newline keeps the message apart from the first backtrace line.
    puts "#{e.message}\n#{e.backtrace.join("\n")}"
  end

  # Nothing to release when the worker stops.
  def closing_worker
  end

  # Fills obj.output_text with the text form of every sequence, routes each
  # sequence to its output files / stats and finally drops the sequences
  # from the group, since they are not needed once the output is built.
  def add_output_data(obj)
    obj.output_text = []

    obj.each do |seq|
      obj.output_text << seq.to_text
      write_seq_to_files(obj.output_files, seq, obj.stats)
    end

    obj.remove_all_seqs
  end

  # Adds +count+ to stats[key][subkey][value], creating the nested hashes
  # and a zero counter on first use. Returns the updated counter.
  def add_stat(stats, key, subkey, value, count = 1)
    stats[key] ||= {}
    stats[key][subkey] ||= {}
    stats[key][subkey][value] ||= 0

    stats[key][subkey][value] += count
  end

  # Routes one processed sequence to the in-memory output files and updates
  # the statistics:
  # * rejected sequences and sequences without inserts go to rejected.txt
  # * two inserts are written as a left/right pair to the paired file
  # * one insert is written to the sequences or low complexity file
  #
  # NOTE(review): a sequence with more than two inserts produces no output
  # and no stat here - confirm this is intended.
  def write_seq_to_files(files, seq, stats)
    dir_name, file_name = seq.get_file_tag_path

    # get current inserts
    inserts = seq.get_inserts

    # qualities are optional (stays nil when they are not in use)
    qual_inserts = seq.get_qual_inserts if @use_qual

    # save json if necessary
    json_file(files) << seq.to_json if @use_json

    # find mids
    mid = seq.get_actions(ActionMid).first

    if seq.seq_rejected
      write_rejected_seq(files, seq, stats, seq.seq_rejected_by_message)
    elsif inserts.empty?
      write_rejected_seq(files, seq, stats, 'No valid inserts found')
    elsif inserts.count == 2
      write_paired_seq(files, seq, stats, dir_name, file_name, inserts, qual_inserts, mid)
    elsif inserts.count == 1
      write_single_seq(files, seq, stats, dir_name, file_name, inserts, qual_inserts, mid)
    end
  end

  # ACCESS TO FILES
  # Every accessor returns the line buffer stored in +files+ for one path.

  def json_file(files)
    get_file(files, File.join(OUTPUT_PATH, 'results.json'))
  end

  def rejected_output_file(files)
    get_file(files, File.join(OUTPUT_PATH, 'rejected.txt'))
  end

  def sequence_file(files, dir_name, file_name)
    get_file(files, File.join(OUTPUT_PATH, dir_name, 'sequences_' + file_name + '.fastq'))
  end

  def paired_file(files, dir_name, file_name)
    get_file(files, File.join(OUTPUT_PATH, dir_name, 'paired_' + file_name + '.fastq'))
  end

  def low_complexity_file(files, dir_name, file_name)
    get_file(files, File.join(OUTPUT_PATH, dir_name, 'low_complexity_' + file_name + '.fastq'))
  end

  def sffinfo_file(files, dir_name, file_name)
    get_file(files, File.join(OUTPUT_PATH, dir_name, 'sff_info_' + file_name + '.txt'))
  end

  def low_sffinfo_file(files, dir_name, file_name)
    get_file(files, File.join(OUTPUT_PATH, dir_name, 'low_complexity_sff_info_' + file_name + '.txt'))
  end

  # Returns the array stored under +fn+ in +files+, creating an empty one
  # the first time the path is requested.
  def get_file(files, fn)
    files[fn] ||= []
  end

  private

  # Appends a '>name message' line to the rejected file and counts the
  # rejection both by message and in the global sequence count.
  def write_rejected_seq(files, seq, stats, message)
    rejected_output_file(files) << ('>' + seq.seq_name + ' ' + message)

    add_stat(stats, 'sequences', 'rejected', message)
    add_stat(stats, 'sequences', 'count', 'rejected')
  end

  # Returns [mid_id, mid_message] for the first MID action of a sequence.
  # Without a MID (nil action or message 'no_MID') the id is 'no_MID'.
  def mid_id_and_message(mid)
    return ['no_MID', ' No MID found'] if mid.nil? || (mid.message == 'no_MID')

    mid_id = mid.tag_id
    mid_message = mid_id != '' ? ' ' + mid.message : ''

    [mid_id, mid_message]
  end

  # Writes both inserts of a paired sequence to the paired file. The left
  # insert is written reverse-complemented (qualities reversed) with dir=R,
  # the right insert unchanged with dir=F.
  def write_paired_seq(files, seq, stats, dir_name, file_name, inserts, qual_inserts, mid)
    add_stat(stats, 'sequences', 'count', 'output_seqs_paired')

    # TODO - Add this stats to full stats

    mid_id, = mid_id_and_message(mid)

    left_fasta = inserts[0].reverse.tr('actgACTG', 'tgacTGAC')
    left_qual = @use_qual ? qual_inserts[0].reverse : []
    left_comment = "template=#{seq.seq_name} dir=R library=#{mid_id}"
    paired_file(files, dir_name, file_name) <<
      FastqFile.to_fastq("#{seq.seq_name}_left", left_fasta, left_qual, left_comment)

    right_qual = @use_qual ? qual_inserts[1] : []
    right_comment = "template=#{seq.seq_name} dir=F library=#{mid_id}"
    paired_file(files, dir_name, file_name) <<
      FastqFile.to_fastq("#{seq.seq_name}_right", inserts[1], right_qual, right_comment)
  end

  # Writes a single-insert sequence. Sequences carrying an
  # ActionLowComplexity go to the low complexity files, the rest to the
  # regular sequence files; the insert coordinates (1-based) of the first
  # ActionInsert are appended to the matching sff info file.
  def write_single_seq(files, seq, stats, dir_name, file_name, inserts, qual_inserts, mid)
    _mid_id, mid_message = mid_id_and_message(mid)

    if seq.get_actions(ActionLowComplexity).empty?
      add_stat(stats, 'sequences', 'count', 'output_seqs')
      fasta_file = sequence_file(files, dir_name, file_name)
      sff_file = sffinfo_file(files, dir_name, file_name)
    else
      add_stat(stats, 'sequences', 'count', 'output_seqs_low_complexity')
      fasta_file = low_complexity_file(files, dir_name, file_name)
      sff_file = low_sffinfo_file(files, dir_name, file_name)
    end

    qual = @use_qual ? qual_inserts[0] : []
    name = seq.seq_name

    fasta_file << FastqFile.to_fastq(name, inserts[0], qual, mid_message)

    inserts_pos = seq.get_actions(ActionInsert)
    sff_file << "#{name} #{inserts_pos[0].start_pos+1} #{inserts_pos[0].end_pos+1}"
  end

end
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
######################################
|
|
2
|
+
# Author:: Almudena Bocinos Rioboo
|
|
3
|
+
# Extract stats like mean of sequence's length
|
|
4
|
+
######################################
|
|
5
|
+
|
|
6
|
+
# $: << '/Users/dariogf/progs/ruby/gems/scbi_plot/lib'
|
|
7
|
+
# $: << '/Users/dariogf/progs/ruby/gems/scbi_math/lib'
|
|
8
|
+
|
|
9
|
+
require 'scbi_plot'
|
|
10
|
+
require "scbi_math"
|
|
11
|
+
|
|
12
|
+
# Gathers statistics from all input sequences (lengths, length histogram,
# per-position qualities, leading 4-letter keys), derives the size limits
# stored in the params object, plots the results under 'graphs' and writes a
# JSON/console summary.
class ExtractStats

  # sequence_reader:: object answering with_qual?, num_seqs and each
  #                   (yielding name, fasta and quality values)
  # params::          parameters object (get_param / set_param)
  #
  # All the work (reading, limit calculation, plotting, printing) is done
  # from the constructor.
  def initialize(sequence_reader, params)
    @sequence_lengths = []   # length of every sequence
    @length_frequency = []   # index = length, value = number of sequences
    @keys = {}               # found keys => number of occurrences
    @params = params
    @use_qual = sequence_reader.with_qual?

    @totalnt = 0
    @qv = []

    @sequence_lengths_stats, @length_frequency_stats, @quality_stats = extract_stats_from_sequences(sequence_reader)

    set_params_and_results

    plot_lengths

    plot_qualities if @use_qual

    print_global_stats
  end

  # Iterates over the reader accumulating lengths, key counts, the length
  # histogram and (when qualities are in use) per-position quality values.
  #
  # Returns [length_stats, length_frequency_stats, quality_stats], each
  # built with ScbiNArray.to_na; quality_stats is nil without qualities.
  def extract_stats_from_sequences(sequence_reader)
    sequence_reader.each do |name_seq, fasta_seq, qual|
      l = fasta_seq.length

      @totalnt += l

      # save all lengths
      @sequence_lengths.push l

      # add key value (first four letters, upcased)
      add_key(fasta_seq[0..3].upcase)

      # add fasta length; a length seen for the first time starts at 0 so
      # the histogram holds real counts
      @length_frequency[l] = (@length_frequency[l] || 0) + 1

      # extract qv values
      extract_qv_from_sequence(qual) if @use_qual

      # print some progress info
      if sequence_reader.num_seqs % 10000 == 0
        puts "Calculating stats: #{sequence_reader.num_seqs}"
      end
    end

    length_stats = ScbiNArray.to_na(@sequence_lengths)
    # lengths never seen are nil holes in the array; count them as 0
    length_frequency_stats = ScbiNArray.to_na(@length_frequency.map { |e| e || 0 })
    quality_stats = ScbiNArray.to_na(@qv) if @use_qual

    [length_stats, length_frequency_stats, quality_stats]
  end

  # Plots the length histogram to graphs/size_stats.png with vertical lines
  # at the mode and at the low/high raw size limits taken from the params.
  def plot_lengths
    ensure_graphs_dir

    x = (0...@length_frequency.length).to_a
    y = @length_frequency.map { |e| e || 0 }

    file_name = 'graphs/size_stats.png'

    p = ScbiPlot::Lines.new(file_name, 'Stats of sequence sizes')
    p.x_label = "Sequence length"
    p.y_label = "Number of sequences"

    p.add_x(x)

    p.add_series('sizes', y, 'impulses', 2)

    p.add_vertical_line('Mode', @length_frequency_stats.fat_mode[0])

    p.add_vertical_line('L', @params.get_param('min_sequence_size_raw').to_i)
    p.add_vertical_line('H', @params.get_param('max_sequence_size_raw').to_i)

    p.do_graph
  end

  # Plots mean, min and max quality per nucleotide position, plus a flat
  # line at the 'min_quality' parameter, to graphs/qualities.png.
  def plot_qualities
    ensure_graphs_dir

    minimum_qual_value = @params.get_param('min_quality').to_i

    # get qualities values
    x = (0...@qv.length).to_a
    y = @qv.map { |e| e[:tot] / e[:nseq] }
    min = @qv.map { |e| e[:min] }
    max = @qv.map { |e| e[:max] }
    qual_limit = @qv.map { minimum_qual_value }

    # make plot of qualities
    file_name = 'graphs/qualities.png'

    p = ScbiPlot::Lines.new(file_name, 'Stats of sequence qualities')
    p.x_label = "Nucleotide position"
    p.y_label = "Quality value"

    p.add_x(x)

    p.add_series('mean', y)
    p.add_series('min', min)
    p.add_series('max', max)
    p.add_series('qual limit', qual_limit)

    p.do_graph
  end

  # Folds quality value +q+ into the accumulator of position +i+
  # (total, number of sequences, min and max).
  def add_qv(q, i)
    @qv[i] ||= {:max => 0, :min => 1000000, :nseq => 0, :tot => 0}

    entry = @qv[i]
    entry[:tot] += q
    entry[:nseq] += 1
    entry[:min] = [entry[:min], q].min
    entry[:max] = [entry[:max], q].max
  end

  # Adds every quality value of one sequence at its position.
  def extract_qv_from_sequence(qual)
    qual.each_with_index do |q, i|
      add_qv(q, i)
    end
  end

  # Counts one more occurrence of +key+.
  def add_key(key)
    @keys[key] = (@keys[key] || 0) + 1
  end

  # Returns the key with the highest count (nil when no key was found).
  def get_max_key
    @keys.keys.sort { |e1, e2| @keys[e1] <=> @keys[e2] }.last
  end

  # Stores the detected key data in the params and computes the high size
  # limit ('max_sequence_size_raw'). The low limit is taken directly from
  # the params file. Does nothing but print a notice when no sequence was
  # read.
  def set_params_and_results
    if @sequence_lengths.empty?
      puts "No sequences has been sucessfully readed "
      return
    end

    # set limiting parameters
    @params.set_param('sequencing_key', get_max_key)
    @params.set_param('all_found_keys', @keys.to_json)

    lengths = @sequence_lengths_stats

    if (lengths.variance_coefficient <= 10) || (@params.get_param('accept_very_long_sequences') == 'true')
      # coefficient <= 10 or very long sequences accepted: biggest length + 10
      @params.set_param('max_sequence_size_raw', (lengths.max + 10).to_i)
    else
      # otherwise: mean plus two standard deviations
      @params.set_param('max_sequence_size_raw', (lengths.mean + 2 * lengths.stddev).to_i)
    end
  end

  # Writes the collected statistics to initial_stats.json under OUTPUT_PATH
  # and prints a summary. Skipped when no length statistics exist.
  def print_global_stats
    return if @sequence_lengths_stats.nil?

    initial_stats = {}
    initial_stats[:sequence_count] = @sequence_lengths_stats.size
    initial_stats[:smallest_sequence_size] = @sequence_lengths_stats.min
    initial_stats[:biggest_sequence_size] = @sequence_lengths_stats.max

    initial_stats[:min_sequence_size_raw] = @params.get_param('min_sequence_size_raw')
    initial_stats[:max_sequence_size_raw] = @params.get_param('max_sequence_size_raw')
    initial_stats[:coefficient_of_variance] = @sequence_lengths_stats.variance_coefficient
    initial_stats[:nucleotide_count] = @totalnt
    initial_stats[:mode_of_sizes] = @length_frequency_stats.fat_mode[0]
    initial_stats[:mean_of_sequence_sizes] = @sequence_lengths_stats.mean

    initial_stats[:qv] = @qv
    initial_stats[:used_key] = get_max_key
    initial_stats[:all_keys] = @keys

    File.open(File.join(OUTPUT_PATH, 'initial_stats.json'), 'w') do |f|
      f.puts JSON.pretty_generate(initial_stats)
    end

    puts "_"*10+ " STATISTICS "+"_"*10
    puts "Total sequence count: #{@sequence_lengths_stats.size}"

    puts "Smallest sequence: #{initial_stats[:smallest_sequence_size]} nt"
    puts "Biggest sequence : #{initial_stats[:biggest_sequence_size]} nt"
    puts "Mean of sequence sizes : #{initial_stats[:mean_of_sequence_sizes]} nt"
    puts "Mode of sequence sizes : #{initial_stats[:mode_of_sizes]} nt"

    puts "Low size limit : #{initial_stats[:min_sequence_size_raw]} nt"
    puts "High size limit : #{initial_stats[:max_sequence_size_raw]} nt"

    puts "Coefficient of variation: #{initial_stats[:coefficient_of_variance]} %"
    puts "Total nucleotide count: #{initial_stats[:nucleotide_count]} nt"

    puts "_"*30
  end

  private

  # Creates the 'graphs' folder unless something with that name exists.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  def ensure_graphs_dir
    Dir.mkdir('graphs') unless File.exist?('graphs')
  end

end
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
require 'gnuplot'
|
|
2
|
+
|
|
3
|
+
# Draws one graph with gnuplot as soon as it is instantiated: an impulse
# plot when every x value converts to an Integer, otherwise a histogram that
# uses the x values as tick labels.
class GnuPlotGraph

  # Label series with more entries than this are reduced before plotting.
  MAX_LABELS = 20

  # Number of highest-valued entries kept when a label series is reduced.
  KEPT_LABELS = 10

  # file_name:: output png; when nil no terminal/output is configured
  # x, y::      parallel arrays with the x values (or labels) and y values
  # title::     graph title, defaults to file_name
  def initialize(file_name, x, y, title = nil)
    # NOTE(review): this switches Ruby warnings on for the whole process and
    # is never restored. It looks like a debugging aid (presumably makes the
    # gnuplot gem echo its script) - kept to preserve behaviour, confirm
    # whether it can be removed.
    $VERBOSE = true

    title ||= file_name

    Gnuplot.open do |gp|
      Gnuplot::Plot.new(gp) do |plot|
        plot.title "#{title}"
        plot.xlabel "length"
        plot.ylabel "Number of sequences"
        plot.set "key off" # legend

        if numeric_axis?(x)
          add_numeric_series(plot, x, y)
        else
          add_label_series(plot, x, y)
        end

        unless file_name.nil?
          plot.terminal "png size 800,600"
          plot.output "#{file_name}"
        end
      end
    end
  end

  private

  # True when every value can be converted with Integer().
  def numeric_axis?(values)
    values.all? { |v| integer_like?(v) }
  end

  # True when Integer(value) succeeds. The rescue is kept around the
  # conversion only, so unrelated errors are not hidden.
  def integer_like?(value)
    Integer(value)
    true
  rescue StandardError
    false
  end

  # Adds x/y as an impulse series.
  def add_numeric_series(plot, x, y)
    plot.style "fill pattern 22 border -1"
    plot.set "boxwidth 0.2"

    plot.data << Gnuplot::DataSet.new([x, y]) do |ds|
      ds.with = " imp lw 4"
    end
  end

  # Adds x/y as a histogram with the x values as tick labels. Tick labels
  # are rotated when there are more than 4 of them; series longer than
  # MAX_LABELS are cut down to their KEPT_LABELS highest values.
  def add_label_series(plot, x, y)
    plot.xlabel ""

    plot.set "style fill solid 1.00 border -1"
    plot.set "style histogram clustered gap 1 title offset character 0, 0, 0"
    plot.set "style data histogram"
    plot.set "boxwidth 0.2 absolute"
    plot.set "xtics offset 0,graph 0 rotate 90" if x.count > 4

    x, y = top_entries(x, y) if x.count > MAX_LABELS

    plot.data << Gnuplot::DataSet.new([x, y]) do |ds|
      ds.using = "2:xticlabels(1)" # show the graph and use labels at x
    end
  end

  # Returns [labels, values] holding the KEPT_LABELS entries with the
  # highest values, highest first. A label that appears more than once
  # keeps its last value.
  def top_entries(labels, values)
    by_label = {}
    labels.each_with_index { |label, i| by_label[label] = values[i] }

    top_labels = []
    top_values = []

    KEPT_LABELS.times do
      best = by_label.max_by { |_label, value| value }
      break unless best # fewer distinct labels than KEPT_LABELS

      top_labels.push best[0]
      top_values.push best[1]
      by_label.delete(best[0])
    end

    [top_labels, top_values]
  end

end
|