seqtrimnext 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,290 @@
1
+
2
# Library root: File.dirname applied three times to this file's path.
ROOT_PATH = File.dirname(File.dirname(File.dirname(__FILE__)))

# Make the library folders requirable by bare file name. The order of the
# entries is the order in which they are appended to the load path.
[
  %w[classes],
  %w[classes blast],
  %w[plugins],
  %w[actions],
  %w[utils],
  %w[classes em_classes]
].each do |folders|
  $: << File.expand_path(File.join(ROOT_PATH, *folders))
end

# The library root itself goes last.
$: << File.expand_path(ROOT_PATH)

$SEQTRIM_PATH = ROOT_PATH

# Database locations: an existing BLASTDB environment variable wins (its
# parent folder becomes the raw database folder); otherwise both folders
# default to DB/ and DB/formatted under the library root.
blastdb_from_env = ENV['BLASTDB']
if blastdb_from_env
  $FORMATTED_DB_PATH = blastdb_from_env
  $DB_PATH = File.dirname(blastdb_from_env)
else
  $FORMATTED_DB_PATH = File.expand_path(File.join(ROOT_PATH, 'DB', 'formatted'))
  $DB_PATH = File.expand_path(File.join(ROOT_PATH, 'DB'))
end

# Export the resolved folder through the environment as well.
ENV['BLASTDB'] = $FORMATTED_DB_PATH

# Folder (relative to the working directory) used to build output file names.
OUTPUT_PATH = 'output_files'
35
+
36
+
37
+ # $: << File.expand_path('~/progs/ruby/gems/scbi_mapreduce/lib')
38
+
39
+ require 'scbi_mapreduce'
40
+ require 'params'
41
+ require 'action_manager'
42
+ require 'plugin_manager'
43
+ # require 'sequence_with_action'
44
+ #
45
+ require 'scbi_fastq'
46
+ require 'sequence_group'
47
+
48
+ class SeqtrimWorker < ScbiMapreduce::Worker
49
+
50
# Processes one work unit received by the worker.
#
# obj is wrapped in a SequenceGroup, every plugin is executed on the group,
# the output data is attached to it, and the group is returned.
def process_object(obj)
  SequenceGroup.new(obj).tap do |group|
    # run the plugins first, then build the output from their results
    @plugin_manager.execute_plugins(group)
    add_output_data(group)
  end
end
61
+
62
# Stores the configuration object handed to the worker.
#
# obj is kept in @params, and its 'use_qual' and 'use_json' parameters are
# cached in @use_qual and @use_json.
def receive_initial_config(obj)
  $WORKER_LOG.info('Params received')

  @params = obj
  @use_qual = obj.get_param('use_qual')
  @use_json = obj.get_param('use_json')
end
72
+
73
# Builds the action manager and the plugin manager for this worker.
#
# The plugin list is read from the 'plugin_list' parameter of @params and
# handed, together with @params, to PluginManager.
#
# Errors raised while doing so are reported on stdout (message, then the
# backtrace) and the method returns nil instead of raising. Only
# StandardError and ScriptError (LoadError, SyntaxError, ...) are handled
# this way; Interrupt, SystemExit and other non-error exceptions propagate.
def starting_worker
  $WORKER_LOG.info "Loading actions"
  @action_manager = ActionManager.new

  $WORKER_LOG.info "Loading plugins"
  @plugin_list = @params.get_param('plugin_list')

  # Interpolation (not String#+) so that a plugin list that is not a String
  # cannot raise a TypeError and abort the setup just because of logging.
  $WORKER_LOG.info "PLUGIN LIST:#{@plugin_list}"

  # the plugin manager stores the plugins and loads them
  @plugin_manager = PluginManager.new(@plugin_list, @params)
rescue StandardError, ScriptError => e
  # message and backtrace on separate lines
  puts "#{e.message}\n#{Array(e.backtrace).join("\n")}"
end
88
+
89
+
90
# Intentionally empty: this worker does nothing when closing.
# NOTE(review): presumably a hook invoked by ScbiMapreduce::Worker on
# shutdown - confirm against the scbi_mapreduce documentation.
def closing_worker; end
93
+
94
+
95
# Fills the output side of a processed sequence group.
#
# For every sequence in obj its text form is appended to obj.output_text and
# the sequence is written to the buffers in obj.output_files (updating
# obj.stats). Afterwards the sequences are removed from the group, since
# they are no longer needed once the output has been generated.
def add_output_data(obj)
  obj.output_text = []

  obj.each do |sequence|
    obj.output_text.push(sequence.to_text)
    write_seq_to_files(obj.output_files, sequence, obj.stats)
  end

  obj.remove_all_seqs
end
106
+
107
# Adds count to the counter stored at stats[key][subkey][value], creating
# the intermediate hashes and the counter (starting at 0) when missing.
# Returns the updated counter.
def add_stat(stats, key, subkey, value, count = 1)
  counters = ((stats[key] ||= {})[subkey] ||= {})
  counters[value] = (counters[value] || 0) + count
end
115
+
116
# Appends the output of one processed sequence to the in-memory buffers kept
# in files and updates the counters in stats.
#
# * When @use_json is set, the JSON form of the sequence is always saved.
# * Rejected sequences and sequences without inserts get a
#   ">name message" line in the rejected buffer.
# * Two inserts are written as a pair: "<name>_left" is the first insert
#   reversed and complemented (dir=R), "<name>_right" is the second insert
#   as is (dir=F).
# * One insert goes to the normal buffers, or to the low complexity ones
#   when the sequence carries an ActionLowComplexity action, together with a
#   "name start end" line built from the first ActionInsert (positions + 1).
# * Any other number of inserts produces no further output.
def write_seq_to_files(files,seq, stats)
  dir_name, file_name = seq.get_file_tag_path

  # current inserts; qualities are optional
  inserts = seq.get_inserts
  qual_inserts = seq.get_qual_inserts if @use_qual

  # save json if necessary
  json_file(files) << seq.to_json if @use_json

  # first MID action, if any
  mid = seq.get_actions(ActionMid).first

  if seq.seq_rejected
    message = seq.seq_rejected_by_message
    rejected_output_file(files) << ('>' + seq.seq_name + ' ' + message)

    add_stat(stats, 'sequences', 'rejected', seq.seq_rejected_by_message)
    add_stat(stats, 'sequences', 'count', 'rejected')
  elsif inserts.empty?
    message = 'No valid inserts found'
    rejected_output_file(files) << ('>' + seq.seq_name + ' ' + message)

    add_stat(stats, 'sequences', 'rejected', message)
    add_stat(stats, 'sequences', 'count', 'rejected')
  else
    case inserts.count
    when 2 # paired sequences
      add_stat(stats, 'sequences', 'count', 'output_seqs_paired')

      mid_id, _unused_message = mid_id_and_message(mid)

      # left mate: reversed and complemented insert, reversed qualities
      name = "#{seq.seq_name}_left"
      comment = "template=#{seq.seq_name} dir=R library=#{mid_id}"
      bases = inserts[0].reverse.tr('actgACTG','tgacTGAC')
      quals = @use_qual ? qual_inserts[0].reverse : []
      paired_file(files, dir_name, file_name) << FastqFile.to_fastq(name, bases, quals, comment)

      # right mate: second insert untouched
      name = "#{seq.seq_name}_right"
      comment = "template=#{seq.seq_name} dir=F library=#{mid_id}"
      bases = inserts[1]
      quals = @use_qual ? qual_inserts[1] : []
      paired_file(files, dir_name, file_name) << FastqFile.to_fastq(name, bases, quals, comment)
    when 1 # single insert
      _unused_id, mid_message = mid_id_and_message(mid)

      # low complexity sequences are kept in their own buffers
      if seq.get_actions(ActionLowComplexity).empty?
        add_stat(stats, 'sequences', 'count', 'output_seqs')
        fasta_file = sequence_file(files, dir_name, file_name)
        sff_file = sffinfo_file(files, dir_name, file_name)
      else
        add_stat(stats, 'sequences', 'count', 'output_seqs_low_complexity')
        fasta_file = low_complexity_file(files, dir_name, file_name)
        sff_file = low_sffinfo_file(files, dir_name, file_name)
      end

      quals = @use_qual ? qual_inserts[0] : []
      name = seq.seq_name

      fasta_file << FastqFile.to_fastq(name, inserts[0], quals, mid_message)

      insert_actions = seq.get_actions(ActionInsert)
      sff_file << "#{name} #{insert_actions[0].start_pos+1} #{insert_actions[0].end_pos+1}"
    end
  end
end

# Returns [mid_id, mid_message] for the MID action of a sequence.
# Without a MID action (or when its message is 'no_MID') the pair is
# ['no_MID', ' No MID found']; otherwise the id is the action's tag_id and
# the message is ' ' + the action's message (empty when the tag_id is '').
def mid_id_and_message(mid)
  return ['no_MID', ' No MID found'] if mid.nil? || (mid.message == 'no_MID')

  mid_id = mid.tag_id
  mid_message = ''
  mid_message = ' ' + mid.message if mid_id != mid_message
  [mid_id, mid_message]
end
private :mid_id_and_message
246
+
247
+
248
+ # ACCESS TO FILES
249
+
250
# Buffer for OUTPUT_PATH/results.json inside files.
def json_file(files)
  json_path = File.join(OUTPUT_PATH, 'results.json')
  get_file(files, json_path)
end
253
+
254
# Buffer for OUTPUT_PATH/rejected.txt inside files.
def rejected_output_file(files)
  rejected_path = File.join(OUTPUT_PATH, 'rejected.txt')
  get_file(files, rejected_path)
end
257
+
258
+
259
# Buffer for OUTPUT_PATH/<dir_name>/sequences_<file_name>.fastq inside files.
def sequence_file(files, dir_name, file_name)
  base_name = 'sequences_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
262
+
263
# Buffer for OUTPUT_PATH/<dir_name>/paired_<file_name>.fastq inside files.
def paired_file(files, dir_name, file_name)
  base_name = 'paired_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
266
+
267
# Buffer for OUTPUT_PATH/<dir_name>/low_complexity_<file_name>.fastq inside files.
def low_complexity_file(files, dir_name, file_name)
  base_name = 'low_complexity_' + file_name + '.fastq'
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
270
+
271
# Buffer for OUTPUT_PATH/<dir_name>/sff_info_<file_name>.txt inside files.
def sffinfo_file(files, dir_name, file_name)
  base_name = 'sff_info_' + file_name + '.txt'
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
274
+
275
# Buffer for OUTPUT_PATH/<dir_name>/low_complexity_sff_info_<file_name>.txt
# inside files.
def low_sffinfo_file(files, dir_name, file_name)
  base_name = 'low_complexity_sff_info_' + file_name + '.txt'
  get_file(files, File.join(OUTPUT_PATH, dir_name, base_name))
end
278
+
279
# Returns the buffer stored in files under the name fn, storing a new empty
# array there first when nothing is stored yet.
def get_file(files, fn)
  files[fn] = [] unless files[fn]
  files[fn]
end
289
+
290
+ end
@@ -0,0 +1,255 @@
1
+ ######################################
2
+ # Author:: Almudena Bocinos Rioboo
3
+ # Extract stats like mean of sequence's length
4
+ ######################################
5
+
6
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_plot/lib'
7
+ # $: << '/Users/dariogf/progs/ruby/gems/scbi_math/lib'
8
+
9
+ require 'scbi_plot'
10
+ require "scbi_math"
11
+
12
+ class ExtractStats
13
+
14
# Reads every sequence from sequence_reader, derives the limiting
# parameters and stores them in params, draws the graphs and prints a
# summary. Quality graphs are only drawn when the reader reports qualities.
def initialize(sequence_reader,params)
  @params = params
  @use_qual = sequence_reader.with_qual?

  @sequence_lengths = [] # length of every sequence read
  @length_frequency = [] # number of sequences of each size, indexed by size
  @keys = {}             # found keys and how often each one was seen
  @totalnt = 0           # accumulated length of all sequences
  @qv = []               # per-position quality accumulators

  collected = extract_stats_from_sequences(sequence_reader)
  @sequence_lengths_stats, @length_frequency_stats, @quality_stats = collected

  set_params_and_results
  plot_lengths
  plot_qualities if @use_qual
  print_global_stats
end
39
+
40
# Iterates over every sequence of sequence_reader and accumulates:
# * @totalnt           - total number of nucleotides,
# * @sequence_lengths  - the length of each sequence,
# * @keys              - occurrences of the first four (upcased) characters,
# * @length_frequency  - how many sequences have each length (index = length),
# * @qv                - quality accumulators, only when @use_qual is set.
#
# Returns [length_stats, length_frequency_stats, quality_stats], each built
# with ScbiNArray.to_na; quality_stats is nil when qualities are not used.
def extract_stats_from_sequences(sequence_reader)
  sequence_reader.each do |_name_seq, fasta_seq, qual|
    length = fasta_seq.length

    @totalnt += length

    # save all lengths
    @sequence_lengths.push length

    # add key value
    add_key(fasta_seq[0..3].upcase)

    # Count this sequence in its length bucket. A missing bucket starts at 0
    # so that the first sequence of a given length is counted once.
    @length_frequency[length] = (@length_frequency[length] || 0) + 1

    # extract qv values
    extract_qv_from_sequence(qual) if @use_qual

    # print some progress info
    if sequence_reader.num_seqs % 10000 == 0
      puts "Calculating stats: #{sequence_reader.num_seqs}"
    end
  end

  length_stats = ScbiNArray.to_na(@sequence_lengths)
  # lengths that never occurred are nil in the array: report them as 0
  length_frequency_stats = ScbiNArray.to_na(@length_frequency.map { |count| count || 0 })
  quality_stats = ScbiNArray.to_na(@qv) if @use_qual

  [length_stats, length_frequency_stats, quality_stats]
end
72
+
73
# Draws graphs/size_stats.png: number of sequences for each sequence length,
# with vertical lines at the mode and at the 'min_sequence_size_raw' (L) and
# 'max_sequence_size_raw' (H) parameters. The graphs folder is created in
# the working directory when it does not exist. Returns the value of
# ScbiPlot::Lines#do_graph.
def plot_lengths
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2
  Dir.mkdir('graphs') unless File.exist?('graphs')

  # x = every length from 0 to the longest one, y = sequences of that length
  # (lengths that never occurred are nil in the array and plotted as 0)
  x = (0...@length_frequency.length).to_a
  y = @length_frequency.map { |count| count || 0 }

  file_name = 'graphs/size_stats.png'

  plot = ScbiPlot::Lines.new(file_name, 'Stats of sequence sizes')
  plot.x_label = "Sequence length"
  plot.y_label = "Number of sequences"

  plot.add_x(x)
  plot.add_series('sizes', y, 'impulses', 2)

  plot.add_vertical_line('Mode', @length_frequency_stats.fat_mode[0])
  plot.add_vertical_line('L', @params.get_param('min_sequence_size_raw').to_i)
  plot.add_vertical_line('H', @params.get_param('max_sequence_size_raw').to_i)

  plot.do_graph
end
106
+
107
# Draws graphs/qualities.png: for every nucleotide position the mean,
# minimum and maximum quality collected in @qv, plus a constant line at the
# 'min_quality' parameter. The graphs folder is created in the working
# directory when it does not exist. Returns the value of
# ScbiPlot::Lines#do_graph.
def plot_qualities
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2
  Dir.mkdir('graphs') unless File.exist?('graphs')

  minimum_qual_value = @params.get_param('min_quality').to_i

  # one value per nucleotide position
  positions = (0...@qv.length).to_a
  # total / count, exactly as stored: an Integer division when the
  # accumulated qualities are Integers
  means = @qv.map { |entry| entry[:tot] / entry[:nseq] }
  minimums = @qv.map { |entry| entry[:min] }
  maximums = @qv.map { |entry| entry[:max] }
  qual_limit = Array.new(@qv.length, minimum_qual_value)

  file_name = 'graphs/qualities.png'

  plot = ScbiPlot::Lines.new(file_name, 'Stats of sequence qualities')
  plot.x_label = "Nucleotide position"
  plot.y_label = "Quality value"

  plot.add_x(positions)

  plot.add_series('mean', means)
  plot.add_series('min', minimums)
  plot.add_series('max', maximums)
  plot.add_series('qual limit', qual_limit)

  plot.do_graph
end
148
+
149
+
150
# Accumulates the quality value q seen at position i: total, number of
# values, minimum and maximum are kept in @qv[i].
def add_qv(q, i)
  @qv[i] ||= {:max => 0, :min => 1000000, :nseq => 0, :tot => 0}
  slot = @qv[i]

  slot[:tot] += q
  slot[:nseq] += 1
  slot[:min] = [slot[:min], q].min
  slot[:max] = [slot[:max], q].max
end
162
+
163
# Feeds every quality value of qual, together with its position, to add_qv.
def extract_qv_from_sequence(qual)
  qual.each_with_index { |value, position| add_qv(value, position) }
end
168
+
169
# Counts one more occurrence of key in @keys (first occurrence counts as 1).
def add_key(key)
  @keys[key] = @keys[key].nil? ? 1 : @keys[key] + 1
end
176
+
177
# Returns the key with the highest count in @keys (nil when there are none).
def get_max_key
  by_count = @keys.keys.sort { |left, right| @keys[left] <=> @keys[right] }
  by_count.last
end
180
+
181
# Stores the values derived from the input in @params:
# * 'sequencing_key'        - the most frequent key,
# * 'all_found_keys'        - every found key with its count, as JSON,
# * 'max_sequence_size_raw' - the high size limit.
#
# The minimum size is taken from the params as is. The high limit is the
# longest sequence + 10 when the variance coefficient is at most 10 or the
# 'accept_very_long_sequences' parameter is 'true'; otherwise it is
# mean + 2 * stddev. Nothing is stored when no sequence was read.
def set_params_and_results
  if @sequence_lengths.empty?
    puts "No sequences has been sucessfully readed "
    return
  end

  @params.set_param('sequencing_key', get_max_key)
  @params.set_param('all_found_keys', @keys.to_json)

  lengths = @sequence_lengths_stats
  high_limit =
    if lengths.variance_coefficient <= 10 || @params.get_param('accept_very_long_sequences') == 'true'
      lengths.max + 10
    else # > 10 %
      lengths.mean + 2 * lengths.stddev
    end

  @params.set_param('max_sequence_size_raw', high_limit.to_i)
end
209
+
210
# Writes the initial statistics to OUTPUT_PATH/initial_stats.json and prints
# a summary of them. Does nothing when no length statistics are available.
def print_global_stats
  return if @sequence_lengths_stats.nil?

  lengths = @sequence_lengths_stats

  # key order is the order in which the fields appear in the JSON file
  initial_stats = {
    :sequence_count => lengths.size,
    :smallest_sequence_size => lengths.min,
    :biggest_sequence_size => lengths.max,
    :min_sequence_size_raw => @params.get_param('min_sequence_size_raw'),
    :max_sequence_size_raw => @params.get_param('max_sequence_size_raw'),
    :coefficient_of_variance => lengths.variance_coefficient,
    :nucleotide_count => @totalnt,
    :mode_of_sizes => @length_frequency_stats.fat_mode[0],
    :mean_of_sequence_sizes => lengths.mean,
    :qv => @qv,
    :used_key => get_max_key,
    :all_keys => @keys
  }

  File.open(File.join(OUTPUT_PATH, 'initial_stats.json'), 'w') do |f|
    f.puts JSON.pretty_generate(initial_stats)
  end

  puts "_"*10+ " STATISTICS "+"_"*10,
       "Total sequence count: #{lengths.size}",
       "Smallest sequence: #{initial_stats[:smallest_sequence_size]} nt",
       "Biggest sequence : #{initial_stats[:biggest_sequence_size]} nt",
       "Mean of sequence sizes : #{initial_stats[:mean_of_sequence_sizes]} nt",
       "Mode of sequence sizes : #{initial_stats[:mode_of_sizes]} nt",
       "Low size limit : #{initial_stats[:min_sequence_size_raw]} nt",
       "High size limit : #{initial_stats[:max_sequence_size_raw]} nt",
       "Coefficient of variation: #{initial_stats[:coefficient_of_variance]} %",
       "Total nucleotide count: #{initial_stats[:nucleotide_count]} nt",
       "_"*30
end
253
+
254
+
255
+ end
@@ -0,0 +1,140 @@
1
+ require 'gnuplot'
2
+
3
+ class GnuPlotGraph
4
+
5
# Draws a graph of y against x with gnuplot.
#
# file_name - PNG file to write (800x600); when nil no terminal/output is set.
# x         - values for the X axis. When every value can be converted with
#             Integer() the data is drawn with impulses; otherwise the
#             values are used as tick labels of a histogram.
# y         - one value per element of x.
# title     - graph title; file_name is used when it is not given.
#
# With more than 20 labels only the 10 labels with the highest values are
# drawn. This constructor does not print anything itself and leaves the
# global $VERBOSE flag untouched.
def initialize(file_name,x,y,title=nil)
  Gnuplot.open do |gp|
    Gnuplot::Plot.new( gp ) do |plot|

      plot.title "#{title || file_name}"
      plot.xlabel "length"
      plot.ylabel "Number of sequences"
      plot.set "key off" # no legend

      if all_integers?(x)
        plot.style "fill pattern 22 border -1"
        plot.set "boxwidth 0.2"

        plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
          ds.with= " imp lw 4"
        end

      else # graph with strings in X axis
        plot.xlabel ""

        plot.set "style fill solid 1.00 border -1"
        plot.set "style histogram clustered gap 1 title offset character 0, 0, 0"
        plot.set "style data histogram"
        plot.set "boxwidth 0.2 absolute"
        # rotate the labels once there are more than a few of them
        plot.set "xtics offset 0,graph 0 rotate 90" if x.count>4

        # too many labels to read: keep only the highest ones
        x, y = keep_highest(x, y, 10) if x.count>20

        plot.data << Gnuplot::DataSet.new( [x,y] ) do |ds|
          ds.using = "2:xticlabels(1)" # show the graph and use labels at x
        end

      end

      if !file_name.nil?
        plot.terminal "png size 800,600"
        plot.output "#{file_name}"
      end
    end

  end

end

# True when Integer() accepts every element of values.
def all_integers?(values)
  values.all? do |value|
    begin
      Integer(value)
      true
    rescue StandardError
      false
    end
  end
end
private :all_integers?

# Returns [labels, values] restricted to the (at most) limit labels with the
# highest values, highest first. Repeated labels keep their last value.
def keep_highest(labels, values, limit)
  remaining = {}
  labels.each_with_index { |label, index| remaining[label] = values[index] }

  kept_labels = []
  kept_values = []

  limit.times do
    best = remaining.max_by { |_label, value| value }
    break unless best

    kept_labels.push best[0]
    kept_values.push best[1]
    remaining.delete(best[0])
  end

  [kept_labels, kept_values]
end
private :keep_highest
138
+
139
+
140
+ end