seqtrimnext 2.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,290 @@
1
+
2
+ #finds the classes that were in the folder 'classes'
3
+
4
# Root folder of the library: three directory levels above this file.
ROOT_PATH = File.dirname(File.dirname(File.dirname(__FILE__)))

# Folders appended to the load path so the library files can be required by
# bare name. Order is kept: classes, classes/blast, plugins, actions, utils,
# classes/em_classes and finally the root folder itself.
[
  ['classes'],
  ['classes', 'blast'],
  ['plugins'],
  ['actions'],
  ['utils'],
  ['classes', 'em_classes'],
  []
].each do |sub_folders|
  $: << File.expand_path(File.join(ROOT_PATH, *sub_folders))
end

$SEQTRIM_PATH = ROOT_PATH

# An existing BLASTDB environment variable wins; otherwise the databases are
# looked up in the DB/formatted folder under the root.
blast_db = ENV['BLASTDB']
if blast_db
  $FORMATTED_DB_PATH = blast_db
  $DB_PATH = File.dirname($FORMATTED_DB_PATH)
else
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB", 'formatted'))
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, "DB"))
end

# Export the chosen path through the BLASTDB environment variable.
ENV['BLASTDB'] = $FORMATTED_DB_PATH

# Folder (relative path) used for the output files.
OUTPUT_PATH = 'output_files'
35
+
36
+
37
+ # $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
38
+
39
+ require 'scbi_mapreduce'
40
+ require 'params'
41
+ require 'action_manager'
42
+ require 'plugin_manager'
43
+ # require 'sequence_with_action'
44
+ #
45
+ require 'scbi_fastq'
46
+ require 'sequence_group'
47
+
48
+ class SeqtrimWorker < ScbiMapreduce::Worker
49
+
50
# Processes one work unit.
#
# obj is wrapped in a SequenceGroup, the plugins are run over the group and
# its output data is filled in. Returns the SequenceGroup.
def process_object(obj)
  SequenceGroup.new(obj).tap do |group|
    # execute plugins
    @plugin_manager.execute_plugins(group)

    # add output data
    add_output_data(group)
  end
end
61
+
62
# Keeps the received parameters object and caches its 'use_qual' and
# 'use_json' values for later use when writing sequences.
def receive_initial_config(obj)
  $WORKER_LOG.info "Params received"

  # the parameters arrive already loaded; nothing is read from disk here
  @params = obj

  @use_qual = obj.get_param('use_qual')
  @use_json = obj.get_param('use_json')
end
72
+
73
# Creates the action manager and the plugin manager for this worker.
#
# Reads 'plugin_list' from the parameters received earlier and hands it,
# together with the parameters, to PluginManager.
#
# Errors raised while loading are printed to stdout and not re-raised.
# StandardError and ScriptError (which includes LoadError from a failed
# require) are handled; interpreter-level exceptions such as SystemExit or
# Interrupt are no longer swallowed.
def starting_worker
  $WORKER_LOG.info "Loading actions"
  @action_manager = ActionManager.new

  $WORKER_LOG.info "Loading plugins"
  @plugin_list = @params.get_param('plugin_list')

  # Interpolation instead of String#+ so that logging cannot fail when the
  # list is an array or nil.
  $WORKER_LOG.info "PLUGIN LIST:#{@plugin_list}"

  # The plugin manager stores the plugins and loads them.
  @plugin_manager = PluginManager.new(@plugin_list, @params)
rescue StandardError, ScriptError => e
  # message and backtrace on separate lines
  puts "#{e.message}\n#{Array(e.backtrace).join("\n")}"
end
88
+
89
+
90
# Intentionally empty: this worker has nothing to release.
# (presumably a shutdown hook called by ScbiMapreduce — confirm against the gem)
def closing_worker; end
93
+
94
+
95
# Fills the output data of a processed group.
#
# For every sequence in obj its text form is appended to obj.output_text and
# the sequence is routed to the output buffers through write_seq_to_files.
# Afterwards all sequences are removed from the group.
def add_output_data(obj)
  obj.output_text = []

  obj.each do |sequence|
    obj.output_text << sequence.to_text
    write_seq_to_files(obj.output_files, sequence, obj.stats)
  end

  # the sequences themselves are not needed once their output is stored
  obj.remove_all_seqs
end
106
+
107
# Adds count to the counter stats[key][subkey][value], creating the nested
# hashes and the counter (starting at 0) when they do not exist yet.
# Returns the updated counter.
def add_stat(stats,key,subkey,value,count=1)
  by_subkey = (stats[key] ||= {})
  counters = (by_subkey[subkey] ||= {})

  counters[value] = (counters[value] || 0) + count
end
115
+
116
# Appends one processed sequence to the matching in-memory output buffer and
# updates the statistics.
#
# files:: hash of output buffers, accessed through the *_file helpers
# seq::   processed sequence (queried for inserts, actions, rejection state)
# stats:: nested counters hash, updated through add_stat
#
# Routing:
# * rejected sequence       -> rejected list, with the rejection message
# * no inserts              -> rejected list, 'No valid inserts found'
# * two inserts             -> paired file (left read reverse-complemented)
# * one insert              -> sequence file, or low-complexity file when the
#                              sequence has an ActionLowComplexity action
def write_seq_to_files(files,seq, stats)
  dir_name, file_name = seq.get_file_tag_path

  # get current inserts
  inserts = seq.get_inserts

  # qualities are optional
  qual_inserts = @use_qual ? seq.get_qual_inserts : nil

  # save json if necessary
  json_file(files) << seq.to_json if @use_json

  # find mids
  mid = seq.get_actions(ActionMid).first

  if seq.seq_rejected
    write_rejected_seq(files, stats, seq, seq.seq_rejected_by_message)
  elsif inserts.empty?
    write_rejected_seq(files, stats, seq, 'No valid inserts found')
  elsif inserts.count == 2
    # sequence with two inserts = PAIRED SEQUENCES
    add_stat(stats, 'sequences', 'count', 'output_seqs_paired')
    mid_id, = mid_id_and_message(mid)
    write_paired_inserts(paired_file(files, dir_name, file_name), seq, inserts, qual_inserts, mid_id)
  elsif inserts.count == 1
    write_single_insert(files, stats, seq, dir_name, file_name, inserts, qual_inserts, mid)
  end
  # NOTE(review): a sequence with more than two inserts is neither written nor
  # counted here — confirm that this is intended.
end

# Internal helper: returns [mid_id, mid_message] for the MID action of a
# sequence; a missing MID or one whose message is 'no_MID' yields the
# 'no_MID' placeholders.
def mid_id_and_message(mid)
  return ['no_MID', ' No MID found'] if mid.nil? || mid.message == 'no_MID'

  mid_id = mid.tag_id
  # an empty tag id keeps an empty message
  mid_message = mid_id == '' ? '' : ' ' + mid.message
  [mid_id, mid_message]
end

# Internal helper: stores the sequence name plus message in the rejected list
# and counts the rejection.
def write_rejected_seq(files, stats, seq, message)
  rejected_output_file(files) << ('>' + seq.seq_name + ' ' + message)

  add_stat(stats, 'sequences', 'rejected', message)
  add_stat(stats, 'sequences', 'count', 'rejected')
end

# Internal helper: writes both inserts of a paired sequence to output.
# The left insert (and its qualities) is reversed and complemented.
def write_paired_inserts(output, seq, inserts, qual_inserts, mid_id)
  left_seq = inserts[0].reverse.tr('actgACTG', 'tgacTGAC')
  left_qual = @use_qual ? qual_inserts[0].reverse : []
  output << FastqFile.to_fastq("#{seq.seq_name}_left", left_seq, left_qual,
                               "template=#{seq.seq_name} dir=R library=#{mid_id}")

  right_qual = @use_qual ? qual_inserts[1] : []
  output << FastqFile.to_fastq("#{seq.seq_name}_right", inserts[1], right_qual,
                               "template=#{seq.seq_name} dir=F library=#{mid_id}")
end

# Internal helper: writes a single-insert sequence and its insert position.
# Low-complexity sequences go to their own pair of files.
def write_single_insert(files, stats, seq, dir_name, file_name, inserts, qual_inserts, mid)
  _mid_id, mid_message = mid_id_and_message(mid)

  if seq.get_actions(ActionLowComplexity).empty?
    add_stat(stats, 'sequences', 'count', 'output_seqs')
    fasta_file = sequence_file(files, dir_name, file_name)
    sff_file = sffinfo_file(files, dir_name, file_name)
  else
    add_stat(stats, 'sequences', 'count', 'output_seqs_low_complexity')
    fasta_file = low_complexity_file(files, dir_name, file_name)
    sff_file = low_sffinfo_file(files, dir_name, file_name)
  end

  quals = @use_qual ? qual_inserts[0] : []
  name = seq.seq_name

  fasta_file << FastqFile.to_fastq(name, inserts[0], quals, mid_message)

  # both positions are written incremented by one
  # (presumably converting 0-based positions to 1-based — confirm)
  insert_pos = seq.get_actions(ActionInsert)[0]
  sff_file << "#{name} #{insert_pos.start_pos+1} #{insert_pos.end_pos+1}"
end
246
+
247
+
248
+ # ACCESS TO FILES
249
+
250
# Each accessor below returns, through get_file, the buffer registered in
# files for one output path under OUTPUT_PATH.

# Buffer for 'results.json'.
def json_file(files)
  get_file(files, File.join(OUTPUT_PATH, 'results.json'))
end

# Buffer for 'rejected.txt'.
def rejected_output_file(files)
  get_file(files, File.join(OUTPUT_PATH, 'rejected.txt'))
end

# Buffer for '<dir_name>/sequences_<file_name>.fastq'.
def sequence_file(files, dir_name, file_name)
  tagged_output_file(files, dir_name, 'sequences_' + file_name + '.fastq')
end

# Buffer for '<dir_name>/paired_<file_name>.fastq'.
def paired_file(files, dir_name, file_name)
  tagged_output_file(files, dir_name, 'paired_' + file_name + '.fastq')
end

# Buffer for '<dir_name>/low_complexity_<file_name>.fastq'.
def low_complexity_file(files, dir_name, file_name)
  tagged_output_file(files, dir_name, 'low_complexity_' + file_name + '.fastq')
end

# Buffer for '<dir_name>/sff_info_<file_name>.txt'.
def sffinfo_file(files, dir_name, file_name)
  tagged_output_file(files, dir_name, 'sff_info_' + file_name + '.txt')
end

# Buffer for '<dir_name>/low_complexity_sff_info_<file_name>.txt'.
def low_sffinfo_file(files, dir_name, file_name)
  tagged_output_file(files, dir_name, 'low_complexity_sff_info_' + file_name + '.txt')
end

# Internal helper: buffer for base_name inside dir_name under OUTPUT_PATH.
def tagged_output_file(files, dir_name, base_name)
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
278
+
279
# Returns the buffer stored in files under fn; when there is none yet an
# empty array is stored there first and returned.
def get_file(files,fn)
  files[fn] ||= []
end
289
+
290
+ end
@@ -0,0 +1,255 @@
1
+ ######################################
2
+ # Author:: Almudena Bocinos Rioboo
3
+ # Extract stats like mean of sequence's length
4
+ ######################################
5
+
6
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_plot/lib'
7
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_math/lib'
8
+
9
+ require 'scbi_plot'
10
+ require "scbi_math"
11
+
12
+ class ExtractStats
13
+
14
# Runs the whole statistics extraction on construction: reads every sequence
# from sequence_reader, updates params with the results, draws the graphs
# and prints/saves the global statistics.
#
# sequence_reader:: source of sequences; must answer with_qual? and be
#                   accepted by extract_stats_from_sequences
# params::          parameters object, read and updated with the results
def initialize(sequence_reader,params)
  @params = params
  @use_qual = sequence_reader.with_qual?

  # accumulators filled while reading the sequences
  @sequence_lengths = [] # length of every sequence
  @length_frequency = [] # number of sequences of each size (frequency)
  @keys = {}             # found keys
  @totalnt = 0
  @qv = []

  stats = extract_stats_from_sequences(sequence_reader)
  @sequence_lengths_stats, @length_frequency_stats, @quality_stats = stats

  set_params_and_results
  plot_lengths
  plot_qualities if @use_qual
  print_global_stats
end
39
+
40
# Reads every sequence from sequence_reader and accumulates the raw data for
# the statistics: total nucleotides, individual lengths, number of sequences
# per length, key candidates and (when qualities are in use) quality values.
#
# sequence_reader:: yields name, sequence and qualities from each, and
#                   answers num_seqs (used only for progress output)
#
# Returns [length_stats, length_frequency_stats, quality_stats], each built
# with ScbiNArray.to_na; quality_stats is nil when qualities are not used.
def extract_stats_from_sequences(sequence_reader)
  sequence_reader.each do |_name_seq, fasta_seq, qual|
    length = fasta_seq.length

    @totalnt += length

    # save all lengths
    @sequence_lengths.push length

    # the first four bases, upcased, are counted as a key candidate
    add_key(fasta_seq[0..3].upcase)

    # count this sequence under its length; a length seen for the first time
    # starts from 0 so that it ends up counted once
    @length_frequency[length] = (@length_frequency[length] || 0) + 1

    # extract qv values
    extract_qv_from_sequence(qual) if @use_qual

    # print some progress info
    if sequence_reader.num_seqs % 10000 == 0
      puts "Calculating stats: #{sequence_reader.num_seqs}"
    end
  end

  length_stats = ScbiNArray.to_na(@sequence_lengths)
  # lengths never seen are nil holes in the array; report them as 0
  length_frequency_stats = ScbiNArray.to_na(@length_frequency.map { |e| e || 0 })
  quality_stats = ScbiNArray.to_na(@qv) if @use_qual

  [length_stats, length_frequency_stats, quality_stats]
end
72
+
73
# Draws 'graphs/size_stats.png': number of sequences for every sequence
# length, with vertical lines at the mode and at the low ('L') and high
# ('H') raw size limits taken from the parameters.
def plot_lengths
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2
  Dir.mkdir('graphs') unless File.exist?('graphs')

  x = (0...@length_frequency.length).to_a
  # lengths without sequences are nil holes; plot them as 0
  y = @length_frequency.map { |e| e || 0 }

  file_name = 'graphs/size_stats.png'

  plot = ScbiPlot::Lines.new(file_name, 'Stats of sequence sizes')
  plot.x_label = "Sequence length"
  plot.y_label = "Number of sequences"

  plot.add_x(x)

  plot.add_series('sizes', y, 'impulses', 2)

  plot.add_vertical_line('Mode', @length_frequency_stats.fat_mode[0])

  plot.add_vertical_line('L', @params.get_param('min_sequence_size_raw').to_i)
  plot.add_vertical_line('H', @params.get_param('max_sequence_size_raw').to_i)

  plot.do_graph
end
106
+
107
# Draws 'graphs/qualities.png': for every nucleotide position the mean,
# minimum and maximum quality value collected in @qv, plus a constant series
# at the 'min_quality' parameter.
def plot_qualities
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2
  Dir.mkdir('graphs') unless File.exist?('graphs')

  minimum_qual_value = @params.get_param('min_quality').to_i

  # get qualities values, one entry per position
  x = (0...@qv.length).to_a
  # total / count; this is an integer division when the totals are integers
  mean = @qv.map { |e| e[:tot] / e[:nseq] }
  min = @qv.map { |e| e[:min] }
  max = @qv.map { |e| e[:max] }
  qual_limit = Array.new(@qv.length, minimum_qual_value)

  # make plot of qualities
  file_name = 'graphs/qualities.png'

  plot = ScbiPlot::Lines.new(file_name, 'Stats of sequence qualities')
  plot.x_label = "Nucleotide position"
  plot.y_label = "Quality value"

  plot.add_x(x)

  plot.add_series('mean', mean)
  plot.add_series('min', min)
  plot.add_series('max', max)
  plot.add_series('qual limit', qual_limit)

  plot.do_graph
end
148
+
149
+
150
# Accumulates quality value q for position i in @qv: running total, number
# of values seen, and the minimum and maximum seen so far. The entry for a
# position is created on first use.
def add_qv(q,i)
  entry = (@qv[i] ||= {:max => 0, :min => 1000000, :nseq => 0, :tot => 0})

  entry[:tot] += q
  entry[:nseq] += 1
  entry[:min] = [entry[:min], q].min
  entry[:max] = [entry[:max], q].max
end
162
+
163
# Feeds every quality value of one sequence, together with its position,
# into add_qv.
def extract_qv_from_sequence(qual)
  qual.each_with_index { |value, position| add_qv(value, position) }
end
168
+
169
# Counts one more occurrence of key in @keys (a key seen for the first time
# starts at 1). Returns the updated count.
def add_key(key)
  @keys[key] = @keys[key].nil? ? 1 : @keys[key] + 1
end
176
+
177
# Returns the key with the highest count in @keys, or nil when no key has
# been counted. Found in a single pass instead of sorting every key; which
# key wins a tie is not specified.
def get_max_key
  most_frequent = @keys.max_by { |_key, count| count }
  most_frequent && most_frequent.first
end
180
+
181
# Stores the results of the analysis in the parameters: the most frequent
# key ('sequencing_key'), every key found as JSON ('all_found_keys') and the
# high size limit ('max_sequence_size_raw').
#
# The low size limit is not touched here. Nothing is set when no sequence
# was read.
def set_params_and_results
  if @sequence_lengths.empty?
    puts "No sequences has been sucessfully readed "
    return
  end

  @params.set_param('sequencing_key', get_max_key)
  @params.set_param('all_found_keys', @keys.to_json)

  lengths = @sequence_lengths_stats

  # With a variation coefficient of at most 10, or when very long sequences
  # are explicitly accepted, the limit is the biggest size plus 10; otherwise
  # it is the mean plus two standard deviations.
  keep_longest = lengths.variance_coefficient <= 10 ||
                 @params.get_param('accept_very_long_sequences') == 'true'

  high_limit = keep_longest ? lengths.max + 10 : lengths.mean + 2 * lengths.stddev

  @params.set_param('max_sequence_size_raw', high_limit.to_i)
end
209
+
210
# Saves the initial statistics as pretty-printed JSON in
# OUTPUT_PATH/initial_stats.json and prints a summary to stdout.
# Does nothing when there are no sequence length statistics.
def print_global_stats
  return if @sequence_lengths_stats.nil?

  lengths = @sequence_lengths_stats

  initial_stats = {
    :sequence_count => lengths.size,
    :smallest_sequence_size => lengths.min,
    :biggest_sequence_size => lengths.max,
    :min_sequence_size_raw => @params.get_param('min_sequence_size_raw'),
    :max_sequence_size_raw => @params.get_param('max_sequence_size_raw'),
    :coefficient_of_variance => lengths.variance_coefficient,
    :nucleotide_count => @totalnt,
    :mode_of_sizes => @length_frequency_stats.fat_mode[0],
    :mean_of_sequence_sizes => lengths.mean,
    :qv => @qv,
    :used_key => get_max_key,
    :all_keys => @keys
  }

  File.open(File.join(OUTPUT_PATH, 'initial_stats.json'), 'w') do |f|
    f.puts JSON.pretty_generate(initial_stats)
  end

  # summary on stdout, one line per entry
  puts [
    "#{'_' * 10} STATISTICS #{'_' * 10}",
    "Total sequence count: #{lengths.size}",
    "Smallest sequence: #{initial_stats[:smallest_sequence_size]} nt",
    "Biggest sequence : #{initial_stats[:biggest_sequence_size]} nt",
    "Mean of sequence sizes : #{initial_stats[:mean_of_sequence_sizes]} nt",
    "Mode of sequence sizes : #{initial_stats[:mode_of_sizes]} nt",
    "Low size limit : #{initial_stats[:min_sequence_size_raw]} nt",
    "High size limit : #{initial_stats[:max_sequence_size_raw]} nt",
    "Coefficient of variation: #{initial_stats[:coefficient_of_variance]} %",
    "Total nucleotide count: #{initial_stats[:nucleotide_count]} nt",
    '_' * 30
  ]
end
253
+
254
+
255
+ end
@@ -0,0 +1,140 @@
1
+ require 'gnuplot'
2
+
3
+ class GnuPlotGraph
4
+
5
# Draws a graph of y against x with gnuplot.
#
# file_name:: output path of the PNG (800x600); when nil no terminal or
#             output is configured
# x::         X values. When every one is accepted by Integer() an impulse
#             plot is drawn; otherwise a histogram using them as labels
# y::         values, in the same order as x
# title::     graph title; defaults to file_name
#
# Histograms with more than 20 labels are reduced to the 10 entries with the
# biggest values.
def initialize(file_name,x,y,title=nil)
  # NOTE(review): switching interpreter warnings on globally from a
  # constructor looks like a debugging leftover — kept, confirm it can go.
  $VERBOSE = true

  title ||= file_name

  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      plot.title "#{title}"
      plot.xlabel "length"
      plot.ylabel "Number of sequences"
      plot.set "key off" # legend

      if integer_labels?(x)
        plot.style "fill pattern 22 border -1"
        plot.set "boxwidth 0.2"

        plot.data << Gnuplot::DataSet.new([x, y]) do |ds|
          ds.with = " imp lw 4"
        end
      else # graph with strings in X axis
        plot.xlabel ""

        plot.set "style fill solid 1.00 border -1"
        plot.set "style histogram clustered gap 1 title offset character 0, 0, 0"
        plot.set "style data histogram"
        plot.set "boxwidth 0.2 absolute"
        # rotate the labels as soon as there are more than four of them
        plot.set "xtics offset 0,graph 0 rotate 90" if x.count > 4

        labels = x
        values = y
        # too many labels to read: keep only the 10 biggest entries
        labels, values = largest_entries(x, y, 10) if x.count > 20

        plot.data << Gnuplot::DataSet.new([labels, values]) do |ds|
          ds.using = "2:xticlabels(1)" # show the graph and use labels at x
        end
      end

      unless file_name.nil?
        plot.terminal "png size 800,600"
        plot.output "#{file_name}"
      end
    end
  end
end

# Internal helper: true when every value of x is accepted by Integer().
def integer_labels?(x)
  x.all? do |value|
    begin
      Integer(value)
      true
    rescue StandardError
      false
    end
  end
end

# Internal helper: returns [labels, values] with, at most, the count entries
# of x/y that have the biggest values, biggest first. Repeated labels are
# collapsed, the last value winning.
def largest_entries(x, y, count)
  remaining = {}
  x.each_with_index { |label, index| remaining[label] = y[index] }

  labels = []
  values = []

  count.times do
    best = remaining.max_by { |_label, value| value }
    break unless best

    labels.push best[0]
    values.push best[1]
    remaining.delete(best[0])
  end

  [labels, values]
end
138
+
139
+
140
+ end