genomer-plugin-summary 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/features/contigs.feature +347 -0
- data/features/gaps.feature +34 -0
- data/features/genome.feature +213 -0
- data/features/sequences.feature +39 -8
- data/lib/genomer-plugin-summary/contigs.rb +63 -0
- data/lib/genomer-plugin-summary/enumerators.rb +81 -0
- data/lib/genomer-plugin-summary/format.rb +87 -0
- data/lib/genomer-plugin-summary/gaps.rb +25 -33
- data/lib/genomer-plugin-summary/genome.rb +51 -0
- data/lib/genomer-plugin-summary/metrics.rb +23 -9
- data/lib/genomer-plugin-summary/sequences.rb +44 -70
- data/spec/genomer-plugin-summary_spec/contigs_spec.rb +211 -0
- data/spec/genomer-plugin-summary_spec/enumerators_spec.rb +383 -0
- data/spec/genomer-plugin-summary_spec/format_spec.rb +285 -0
- data/spec/genomer-plugin-summary_spec/gaps_spec.rb +32 -7
- data/spec/genomer-plugin-summary_spec/{scaffold_spec.rb → genome_spec.rb} +26 -7
- data/spec/genomer-plugin-summary_spec/metrics_spec.rb +64 -0
- data/spec/genomer-plugin-summary_spec/sequences_spec.rb +52 -85
- data/spec/spec_helper.rb +1 -1
- metadata +20 -9
- data/features/scaffold.feature +0 -122
- data/lib/genomer-plugin-summary/scaffold.rb +0 -56
data/features/sequences.feature
CHANGED
@@ -25,10 +25,10 @@ Feature: Producing a summary of the scaffold sequences
|
|
25
25
|
+------------------+------------+------------+------------+----------+--------+
|
26
26
|
| Scaffold Sequences |
|
27
27
|
+------------------+------------+------------+------------+----------+--------+
|
28
|
-
|
|
28
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
29
29
|
+------------------+------------+------------+------------+----------+--------+
|
30
30
|
+------------------+------------+------------+------------+----------+--------+
|
31
|
-
| All |
|
31
|
+
| All | 0 | 0 | 0 | 0.00 | 0.00 |
|
32
32
|
+------------------+------------+------------+------------+----------+--------+
|
33
33
|
"""
|
34
34
|
|
@@ -53,7 +53,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
53
53
|
+------------------+------------+------------+------------+----------+--------+
|
54
54
|
| Scaffold Sequences |
|
55
55
|
+------------------+------------+------------+------------+----------+--------+
|
56
|
-
|
|
56
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
57
57
|
+------------------+------------+------------+------------+----------+--------+
|
58
58
|
| contig0001 | 1 | 4 | 4 | 100.00 | 50.00 |
|
59
59
|
+------------------+------------+------------+------------+----------+--------+
|
@@ -87,7 +87,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
87
87
|
+------------------+------------+------------+------------+----------+--------+
|
88
88
|
| Scaffold Sequences |
|
89
89
|
+------------------+------------+------------+------------+----------+--------+
|
90
|
-
|
|
90
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
91
91
|
+------------------+------------+------------+------------+----------+--------+
|
92
92
|
| contig0001 | 1 | 6 | 6 | 50.00 | 66.67 |
|
93
93
|
| contig0002 | 7 | 12 | 6 | 50.00 | 33.33 |
|
@@ -120,7 +120,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
120
120
|
+------------------+------------+------------+------------+----------+--------+
|
121
121
|
| Scaffold Sequences |
|
122
122
|
+------------------+------------+------------+------------+----------+--------+
|
123
|
-
|
|
123
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
124
124
|
+------------------+------------+------------+------------+----------+--------+
|
125
125
|
| contig0001 | 1 | 6 | 6 | 50.00 | 66.67 |
|
126
126
|
| contig0001 | 7 | 12 | 6 | 50.00 | 66.67 |
|
@@ -158,7 +158,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
158
158
|
+------------------+------------+------------+------------+----------+--------+
|
159
159
|
| Scaffold Sequences |
|
160
160
|
+------------------+------------+------------+------------+----------+--------+
|
161
|
-
|
|
161
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
162
162
|
+------------------+------------+------------+------------+----------+--------+
|
163
163
|
| contig0001 | 1 | 6 | 6 | 30.00 | 66.67 |
|
164
164
|
| contig0002 | 15 | 20 | 6 | 30.00 | 33.33 |
|
@@ -196,7 +196,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
196
196
|
+------------------+------------+------------+------------+----------+--------+
|
197
197
|
| Scaffold Sequences |
|
198
198
|
+------------------+------------+------------+------------+----------+--------+
|
199
|
-
|
|
199
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
200
200
|
+------------------+------------+------------+------------+----------+--------+
|
201
201
|
| contig0001 | 9 | 14 | 6 | 30.00 | 66.67 |
|
202
202
|
| contig0002 | 15 | 20 | 6 | 30.00 | 33.33 |
|
@@ -235,7 +235,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
235
235
|
+------------------+------------+------------+------------+----------+--------+
|
236
236
|
| Scaffold Sequences |
|
237
237
|
+------------------+------------+------------+------------+----------+--------+
|
238
|
-
|
|
238
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
239
239
|
+------------------+------------+------------+------------+----------+--------+
|
240
240
|
| contig0001 | 1 | 6 | 6 | 30.00 | 66.67 |
|
241
241
|
| contig0002 | 7 | 12 | 6 | 30.00 | 33.33 |
|
@@ -244,3 +244,34 @@ Feature: Producing a summary of the scaffold sequences
|
|
244
244
|
+------------------+------------+------------+------------+----------+--------+
|
245
245
|
"""
|
246
246
|
|
247
|
+
Scenario: Generating CSV output
|
248
|
+
Given I create a new genomer project
|
249
|
+
And I write to "assembly/scaffold.yml" with:
|
250
|
+
"""
|
251
|
+
---
|
252
|
+
-
|
253
|
+
sequence:
|
254
|
+
source: contig0001
|
255
|
+
-
|
256
|
+
sequence:
|
257
|
+
source: contig0002
|
258
|
+
-
|
259
|
+
unresolved:
|
260
|
+
length: 8
|
261
|
+
"""
|
262
|
+
And I write to "assembly/sequence.fna" with:
|
263
|
+
"""
|
264
|
+
>contig0001
|
265
|
+
ATGCGC
|
266
|
+
>contig0002
|
267
|
+
ATATGC
|
268
|
+
"""
|
269
|
+
When I run `genomer summary sequences --output=csv`
|
270
|
+
Then the exit status should be 0
|
271
|
+
And the output should contain:
|
272
|
+
"""
|
273
|
+
sequence,start_bp,end_bp,size_bp,size_%,gc_%
|
274
|
+
contig0001,1,6,6,30.00,66.67
|
275
|
+
contig0002,7,12,6,30.00,33.33
|
276
|
+
all,1,12,12,60.00,50.00
|
277
|
+
"""
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/format'
|
3
|
+
require 'genomer-plugin-summary/enumerators'
|
4
|
+
require 'genomer-plugin-summary/metrics'
|
5
|
+
|
6
|
+
# Plugin command that tabulates every contig found in the scaffold.
#
# Contigs come from Enumerators#enumerator_for_contig; each one is reported
# with its coordinates, size, share of the scaffold and GC content, followed
# by an 'All' summary row.
class GenomerPluginSummary::Contigs < Genomer::Plugin
  include GenomerPluginSummary::Metrics
  include GenomerPluginSummary::Format
  include GenomerPluginSummary::Enumerators

  # Keys read from each contig hash, in table column order.
  COLUMNS = [:id, :start, :stop, :size, :percent, :gc].freeze

  # Static layout handed to Format#table. Frozen so that per-call settings
  # (the requested output type) can never leak into later calls.
  FORMATTING = {
    :title   => 'Scaffold Contigs',
    :headers => ['Contig', 'Start (bp)', 'End (bp)', 'Size (bp)', 'Size (%)', 'GC (%)'],
    :width   => {
      0 => 6,
      1 => 10,
      2 => 10,
      3 => 10,
      4 => 8,
      5 => 6
    },
    :justification => {
      0 => :right,
      1 => :right,
      2 => :right,
      3 => :right,
      4 => :right,
      5 => :right
    },
    :format => {
      4 => '%#.2f',
      5 => '%#.2f'
    }
  }.freeze

  # Entry point: compute the contig metrics and render them.
  #
  # `scaffold` and `flags` are not defined in this class; presumably they are
  # supplied by Genomer::Plugin.
  #
  # @return [String] the rendered table
  def run
    contigs = calculate(scaffold)
    total   = sequence_total(contigs)

    tabulate(contigs,total,flags)
  end

  # Render the contig rows followed by a separator and the totals row.
  #
  # @param contigs [Array<Hash>] one hash per contig, keyed by COLUMNS
  # @param total [Hash] summary values; columns it lacks are shown as 'All'
  # @param flags [Hash] command line flags; only :output is read
  # @return [String] the rendered table
  def tabulate(contigs,total,flags)
    rows = contigs.map { |contig| COLUMNS.map { |col| contig[col] } }
    rows << :separator
    rows << COLUMNS.map { |col| total[col] || 'All' }

    # Merge into a copy instead of writing into the shared constant.
    table(rows, FORMATTING.merge(:output => flags[:output]))
  end

  # Build the metric hash for every contig in the scaffold.
  #
  # Each returned hash carries :id, :start, :stop, :type, :size, :percent
  # (share of the whole scaffold length) and :gc (percentage of G/C among the
  # A/T/G/C characters). The raw :sequence is removed from the result.
  #
  # @param scaffold [Enumerable] entries responding to #sequence
  # @return [Array<Hash>]
  def calculate(scaffold)
    total_length = scaffold.map(&:sequence).map(&:length).inject(:+).to_f

    enumerator_for_contig(scaffold).map do |contig|
      sequence = contig.delete(:sequence)

      contig[:gc]      = gc(sequence) / atgc(sequence) * 100
      contig[:size]    = sequence.length
      contig[:percent] = contig[:size] / total_length * 100
      contig
    end
  end

end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'lazing'
|
2
|
+
|
3
|
+
# Helpers that walk a scaffold and describe its parts as hashes with the keys
# :sequence, :start, :stop (1-based, inclusive coordinates in the concatenated
# scaffold sequence), :type and :id.
module GenomerPluginSummary::Enumerators

  # Dispatch to the enumerator_for_<type> method.
  #
  # @param type [Symbol, String] one of the suffixes defined in this module
  #   (:sequence, :unresolved, :contig, :gap, :all)
  # @param scaffold [Enumerable] scaffold entries
  def enumerator_for(type,scaffold)
    send('enumerator_for_' + type.to_s, scaffold)
  end

  # Scaffold entries whose entry type is :sequence.
  def enumerator_for_sequence(scaffold)
    enumerator_for_all(scaffold).
      selecting{|entry| entry[:type] == :sequence}
  end

  # Scaffold entries whose entry type is :unresolved.
  def enumerator_for_unresolved(scaffold)
    enumerator_for_all(scaffold).
      selecting{|entry| entry[:type] == :unresolved}
  end

  # Runs of non-N characters in the concatenated scaffold sequence, numbered
  # from 1 in order of appearance.
  #
  # @return [Array<Hash>]
  def enumerator_for_contig(scaffold)
    regions_of_type(scaffold, :contig)
  end

  # Runs of N/n characters in the concatenated scaffold sequence, numbered
  # from 1 in order of appearance.
  #
  # @return [Array<Hash>]
  def enumerator_for_gap(scaffold)
    regions_of_type(scaffold, :gap)
  end

  # Every scaffold entry in order, with running coordinates. :id is the
  # entry's source for :sequence entries and nil for anything else.
  #
  # @return [Array<Hash>]
  def enumerator_for_all(scaffold)
    position = 0

    scaffold.map do |entry|
      sequence = entry.sequence
      start    = position + 1
      position += sequence.length

      {:sequence => sequence,
       :start    => start,
       :stop     => position,
       :type     => entry.entry_type,
       :id       => entry.entry_type == :sequence ? entry.source : nil}
    end
  end

  private

  # Shared implementation of the contig and gap enumerators.
  #
  # The concatenated sequence is cut into alternating regions: maximal runs of
  # N/n, and maximal runs of anything else. Whitespace is never part of a
  # region and does not advance the coordinates.
  #
  # @param scaffold [Enumerable] entries responding to #sequence
  # @param type [Symbol] :gap selects the N runs, :contig the other runs
  # @return [Array<Hash>] the selected regions, :id counting from 1
  def regions_of_type(scaffold, type)
    genome   = scaffold.map(&:sequence).join
    position = 0
    entries  = []

    genome.scan(/[Nn]+|[^Nn\s]+/).each do |region|
      start     = position + 1
      position += region.length

      # Regions of the other kind still advance the coordinates above.
      region_type = region.downcase.include?('n') ? :gap : :contig
      next unless region_type == type

      entries << {:sequence => region,
                  :start    => start,
                  :stop     => position,
                  :type     => type,
                  :id       => entries.length + 1}
    end

    entries
  end

end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
require 'lazing'
|
3
|
+
|
4
|
+
# Rendering helpers that turn rows of cells into a terminal table or CSV text.
#
# A row is an array of cell values. The symbol :separator may stand in place
# of a row: it becomes a horizontal rule in the pretty output and is dropped
# from CSV output.
module GenomerPluginSummary::Format

  # Values used for options the caller leaves out. Each is looked up by
  # zero-based column index.
  DEFAULTS = {
    :justification => {},
    :width         => {},
    :format        => {}
  }

  # Render +data+ with the renderer selected by opts[:output].
  #
  # @param data [Array] rows and/or :separator markers
  # @param opts [Hash] :output ('csv' selects CSV, anything else the pretty
  #   table), plus :title, :headers, :width, :justification and :format
  # @return [String] the rendered text
  def table(data,opts = {})
    opts = DEFAULTS.merge opts
    case opts[:output]
    when 'csv' then csv(data,opts)
    else            pretty(data,opts)
    end
  end

  # Format every cell of every row; :separator markers pass through untouched.
  #
  # @param data [Array] rows and/or :separator markers
  # @param opts [Hash] must provide :width, :justification and :format
  # @return [Array] rows of formatted strings and/or :separator markers
  def create_cells(data,opts)
    data.map do |row|
      next row if row == :separator

      row.each_with_index.map do |cell,index|
        format_cell(cell,
                    opts[:width][index],
                    opts[:justification][index],
                    opts[:format][index])
      end
    end
  end

  # Convert a single cell to a string and pad it to the column width.
  #
  # @param cell [Object] the raw value
  # @param width [Integer, nil] target width; nil disables padding
  # @param justification [Symbol, nil] :right or :center; anything else pads
  #   on the right (left-justified)
  # @param format [String, Proc, nil] sprintf pattern, a callable applied to
  #   the cell, or nil for plain #to_s
  # @return [String]
  # @raise [ArgumentError] if +format+ is none of String, Proc or nil
  def format_cell(cell,width,justification,format = nil)
    formatted = case format
                when String then sprintf(format,cell)
                when Proc   then format.call(cell).to_s
                when nil    then cell.to_s
                else
                  # Previously this fell through and produced a nil cell.
                  raise ArgumentError, "unsupported cell format: #{format.inspect}"
                end

    return formatted if width.nil?

    case justification
    when :right  then formatted.rjust(width)
    when :center then formatted.center(width)
    else              formatted.ljust(width)
    end
  end

  # Render the rows as comma-separated text, one line per row.
  #
  # Padding is disabled. Every line, headers and data alike, is lower-cased,
  # has spaces replaced by underscores and has parentheses removed.
  #
  # @param data [Array] rows and/or :separator markers
  # @param opts [Hash] :headers and :format are read
  # @return [String] newline-terminated CSV text
  def csv(data,opts)
    # Work on a copy so the caller's hash is not modified.
    opts = DEFAULTS.merge(opts).merge(:width => {}, :justification => {})

    rows = create_cells(data,opts)
    rows.unshift opts[:headers] if opts[:headers]

    lines = rows.
      reject { |row| row.nil? || row == :separator }.
      map    { |row| row.join(',').tr(' ','_').delete('()').downcase }

    lines.join("\n") + "\n"
  end

  # Render the rows as a bordered terminal table.
  #
  # When headers are given they are centred above the data, using the
  # configured column width or, failing that, the widest data cell.
  #
  # @param data [Array] rows and/or :separator markers
  # @param opts [Hash] :headers, :title, :width, :justification, :format
  # @return [String] newline-terminated table
  def pretty(data,opts)
    opts  = DEFAULTS.merge(opts)
    cells = create_cells(data,opts)

    if opts[:headers]
      # Separators are symbols, not rows: leave them out of width measurement.
      data_rows = cells.reject { |row| row == :separator }

      headers = opts[:headers].each_with_index.map do |header,index|
        width = opts[:width][index] ||
                data_rows.map { |row| row[index].to_s.length }.max
        format_cell(header, width, :center)
      end

      cells.unshift :separator
      cells.unshift headers
    end

    table = Terminal::Table.new do |t|
      cells.each{|c| t << c}
    end
    opts[:justification].each{|(k,v)| table.align_column k, v }
    table.title ||= opts[:title]
    table.to_s + "\n"
  end

end
|
@@ -1,45 +1,37 @@
|
|
1
1
|
require 'genomer'
|
2
|
-
require '
|
2
|
+
require 'genomer-plugin-summary/format'
|
3
3
|
|
4
4
|
class GenomerPluginSummary::Gaps < Genomer::Plugin
|
5
|
+
include GenomerPluginSummary::Format
|
5
6
|
|
6
7
|
# Entry point: work out the gaps in the scaffold and render them as a table.
def run
  gaps = determine_gaps(scaffold)
  tabulate(gaps,flags)
end
|
9
10
|
|
10
|
-
|
11
|
-
['Number'.center(8),
|
12
|
-
'Length'.center(8),
|
13
|
-
'Start'.center(8),
|
14
|
-
'End'.center(8),
|
15
|
-
'Type'.center(12)]
|
16
|
-
end
|
17
|
-
|
18
|
-
def title
|
19
|
-
'Scaffold Gaps'
|
20
|
-
end
|
21
|
-
|
22
|
-
def tabulate(contigs)
|
23
|
-
table = Terminal::Table.new(:title => title) do |t|
|
24
|
-
t << headings
|
25
|
-
t << :separator
|
26
|
-
contigs.each do |ctg|
|
27
|
-
t << [ctg[:number],
|
28
|
-
ctg[:length],
|
29
|
-
ctg[:start],
|
30
|
-
ctg[:end],
|
31
|
-
ctg[:type]]
|
32
|
-
end
|
33
|
-
end
|
11
|
+
COLUMNS = [:number, :length, :start, :end, :type]
|
34
12
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
13
|
+
FORMATTING = {
|
14
|
+
:title => 'Scaffold Gaps',
|
15
|
+
:headers => ['Number', 'Length', 'Start', 'End', 'Type'],
|
16
|
+
:width => {
|
17
|
+
0 => 8,
|
18
|
+
1 => 8,
|
19
|
+
2 => 8,
|
20
|
+
3 => 8,
|
21
|
+
4 => 12
|
22
|
+
},
|
23
|
+
:justification => {
|
24
|
+
0 => :right,
|
25
|
+
1 => :right,
|
26
|
+
2 => :right,
|
27
|
+
3 => :right,
|
28
|
+
4 => :center
|
29
|
+
}
|
30
|
+
}
|
41
31
|
|
42
|
-
|
32
|
+
# Render the gaps as a table.
#
# @param gaps [Array<Hash>] one hash per gap, keyed by COLUMNS
# @param flags [Hash] command line flags; only :output is read
# @return [String] the rendered table
def tabulate(gaps,flags)
  rows = gaps.map { |gap| COLUMNS.map { |col| gap[col] } }

  # Merge into a copy: writing flags[:output] into the FORMATTING constant
  # would leak one call's output type into every later call.
  table(rows, FORMATTING.merge(:output => flags[:output]))
end
|
44
36
|
|
45
37
|
def gap_locations(seq)
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/metrics'
|
3
|
+
require 'genomer-plugin-summary/format'
|
4
|
+
|
5
|
+
# Plugin command that prints whole-scaffold metrics: counts, lengths and
# percentages for sequences, contigs and gaps, plus overall size and G+C.
class GenomerPluginSummary::Genome < Genomer::Plugin
  include GenomerPluginSummary::Metrics
  include GenomerPluginSummary::Format

  # Table rows in display order. Each hash names the row, the entry type to
  # enumerate and the method (called as method(entry_type, scaffold)) that
  # produces the value; :separator draws a rule.
  LAYOUT = [
    {:name => 'Sequences (#)',   :entry_type => :sequence, :method => :count},
    {:name => 'Contigs (#)',     :entry_type => :contig,   :method => :count},
    {:name => 'Gaps (#)',        :entry_type => :gap,      :method => :count},
    :separator,
    {:name => 'Size (bp)',       :entry_type => :all,      :method => :length},
    {:name => 'Sequences (bp)',  :entry_type => :sequence, :method => :length},
    {:name => 'Contigs (bp)',    :entry_type => :contig,   :method => :length},
    {:name => 'Gaps (bp)',       :entry_type => :gap,      :method => :length},
    :separator,
    {:name => 'G+C (%)',         :entry_type => :all,      :method => :gc_content},
    {:name => 'Sequences (%)',   :entry_type => :sequence, :method => :percent},
    {:name => 'Contigs (%)',     :entry_type => :contig,   :method => :percent},
    {:name => 'Gaps (%)',        :entry_type => :gap,      :method => :percent}
  ].freeze

  # Static layout handed to Format#table. Floats in the value column are
  # shown with two decimals; other values are passed through unchanged.
  # Frozen so a per-call output type cannot stick to later calls.
  FORMATTING = {
    :title         => 'Scaffold',
    :width         => {0 => 12, 1 => 9},
    :justification => {1 => :right},
    :format        => {1 => lambda{|i| i.is_a?(Float) ? sprintf('%#.2f',i) : i }}
  }.freeze

  # Entry point: compute every metric in LAYOUT and render the table.
  #
  # `scaffold` and `flags` are not defined in this class; presumably they are
  # supplied by Genomer::Plugin.
  #
  # @return [String] the rendered table
  def run
    tabulate(calculate_metrics(LAYOUT, scaffold),flags)
  end

  # Render the metric rows.
  #
  # @param data [Array] [name, value] rows and/or :separator markers
  # @param flags [Hash] command line flags; only :output is read
  # @return [String] the rendered table
  def tabulate(data,flags)
    # A nil :output selects the default renderer, exactly like a missing key.
    table(data, FORMATTING.merge(:output => flags[:output]))
  end

  # Evaluate each layout spec against the scaffold.
  #
  # @param specs [Array] hashes with :name, :entry_type and :method, and/or
  #   :separator markers (returned unchanged)
  # @param scaffold [Enumerable] scaffold entries
  # @return [Array] [name, value] pairs and/or :separator markers
  def calculate_metrics(specs,scaffold)
    specs.map do |spec|
      next spec if spec == :separator

      [spec[:name], send(spec[:method],spec[:entry_type],scaffold)]
    end
  end

end
|
@@ -1,13 +1,15 @@
|
|
1
1
|
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/enumerators'
|
2
3
|
require 'lazing'
|
3
4
|
|
4
5
|
module GenomerPluginSummary::Metrics
|
6
|
+
include GenomerPluginSummary::Enumerators
|
5
7
|
|
6
8
|
ALL = :all
|
7
9
|
|
8
10
|
# Percentage of G/C characters among the A/T/G/C characters of all entries of
# the given type.
#
# @param type [Symbol] entry type passed on to enumerator_for
# @param scfd [Enumerable] scaffold entries
# @return [Float] not a finite number when no A/T/G/C characters are present
def gc_content(type,scfd)
  gc_total   = enumerator_for(type,scfd).
                 mapping{|entry| gc(entry[:sequence])}.
                 inject(:+) || 0.0
  atgc_total = enumerator_for(type,scfd).
                 mapping{|entry| atgc(entry[:sequence])}.
                 inject(:+) || 0.0

  gc_total / atgc_total * 100
end
|
13
15
|
|
@@ -21,21 +23,33 @@ module GenomerPluginSummary::Metrics
|
|
21
23
|
|
22
24
|
# Combined sequence length of all entries of the given type.
#
# @param type [Symbol] entry type passed on to enumerator_for
# @param scfd [Enumerable] scaffold entries
# @return [Integer] 0 when there are no such entries
def length(type,scfd)
  total = enumerator_for(type,scfd).
            mapping{|entry| entry[:sequence].length}.
            inject(:+)

  total || 0
end
|
28
30
|
|
29
|
-
def gc(
|
30
|
-
|
31
|
+
# Number of G and C characters (either case) in +sequence+.
#
# @param sequence [String]
# @return [Float] a Float so that callers can divide without truncation
def gc(sequence)
  # String#count tallies in place; no stripped copy of the sequence is built.
  sequence.count('GCgc').to_f
end
|
32
34
|
|
33
|
-
def atgc(
|
34
|
-
|
35
|
+
# Number of A, T, G and C characters (either case) in +sequence+; anything
# else, such as N, is not counted.
#
# @param sequence [String]
# @return [Float] a Float so that callers can divide without truncation
def atgc(sequence)
  # String#count tallies in place; no stripped copy of the sequence is built.
  sequence.count('ATGCatgc').to_f
end
|
36
38
|
|
37
|
-
def
|
38
|
-
|
39
|
+
# Summarise a list of sequence/contig metric hashes into one totals hash.
#
# :start is taken from the first entry and :stop from the last; :size and
# :percent are summed; :gc is the size-weighted mean of the entries' :gc.
#
# @param seqs [Array<Hash>] hashes with :start, :stop, :size, :percent, :gc
# @return [Hash] totals under the same keys; every value is 0 when +seqs+ is
#   empty
def sequence_total(seqs)
  return Hash[[:start, :stop, :size, :percent, :gc].map{|key| [key, 0]}] if seqs.empty?

  totals = seqs.inject({:size => 0, :percent => 0, :gc => 0}) do |sum,entry|
    sum[:start]  ||= entry[:start]
    sum[:stop]     = entry[:stop]
    sum[:size]    += entry[:size]
    sum[:percent] += entry[:percent]
    sum[:gc]      += entry[:gc] * entry[:size]

    sum
  end

  # With no bases in total there is nothing to average over; report 0, as the
  # empty-input case does, rather than dividing by zero.
  totals[:gc] = totals[:size].zero? ? 0 : totals[:gc] / totals[:size]
  totals
end
|
40
54
|
|
41
55
|
end
|
@@ -1,92 +1,66 @@
|
|
1
1
|
require 'genomer'
|
2
2
|
require 'genomer-plugin-summary/metrics'
|
3
|
-
require '
|
3
|
+
require 'genomer-plugin-summary/format'
|
4
4
|
|
5
5
|
# Plugin command that tabulates every sequence entry in the scaffold with its
# coordinates, size, share of the scaffold and GC content, followed by an
# 'All' summary row.
class GenomerPluginSummary::Sequences < Genomer::Plugin
  include GenomerPluginSummary::Metrics
  include GenomerPluginSummary::Format
  include GenomerPluginSummary::Enumerators

  # Keys read from each sequence hash, in table column order.
  COLUMNS = [:id, :start, :stop, :size, :percent, :gc].freeze

  # Static layout handed to Format#table. Frozen so that per-call settings
  # (the requested output type) can never leak into later calls.
  FORMATTING = {
    :title   => 'Scaffold Sequences',
    :headers => ['Sequence', 'Start (bp)', 'End (bp)', 'Size (bp)', 'Size (%)', 'GC (%)'],
    :width   => {
      0 => 16,
      1 => 10,
      2 => 10,
      3 => 10,
      4 => 8,
      5 => 6
    },
    :justification => {
      0 => :left,
      1 => :right,
      2 => :right,
      3 => :right,
      4 => :right,
      5 => :right
    },
    :format => {
      4 => '%#.2f',
      5 => '%#.2f'
    }
  }.freeze

  # Entry point: compute the sequence metrics and render them.
  #
  # `scaffold` and `flags` are not defined in this class; presumably they are
  # supplied by Genomer::Plugin.
  #
  # @return [String] the rendered table
  def run
    sequences = calculate(scaffold)
    total     = sequence_total(sequences)

    tabulate(sequences,total,flags)
  end

  # Render the sequence rows followed by a separator and the totals row.
  #
  # @param sequences [Array<Hash>] one hash per sequence, keyed by COLUMNS
  # @param total [Hash] summary values; columns it lacks are shown as 'All'
  # @param flags [Hash] command line flags; only :output is read
  # @return [String] the rendered table
  def tabulate(sequences,total,flags)
    rows = sequences.map { |sequence| COLUMNS.map { |col| sequence[col] } }
    rows << :separator
    rows << COLUMNS.map { |col| total[col] || 'All' }

    # Merge into a copy instead of writing into the shared constant.
    table(rows, FORMATTING.merge(:output => flags[:output]))
  end

  # Build the metric hash for every sequence entry in the scaffold.
  #
  # Each returned hash carries :id, :start, :stop, :type, :size, :percent
  # (share of the whole scaffold length) and :gc (percentage of G/C among the
  # A/T/G/C characters). The raw :sequence is removed from the result.
  #
  # @param scaffold [Enumerable] entries responding to #sequence
  # @return [Array<Hash>]
  def calculate(scaffold)
    total_length = scaffold.map(&:sequence).map(&:length).inject(:+).to_f

    enumerator_for(:sequence,scaffold).map do |entry|
      sequence = entry.delete(:sequence)

      entry[:size]    = sequence.length
      entry[:gc]      = gc(sequence) / atgc(sequence) * 100
      entry[:percent] = sequence.length / total_length * 100
      entry
    end
  end

end
|