genomer-plugin-summary 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/features/contigs.feature +347 -0
- data/features/gaps.feature +34 -0
- data/features/genome.feature +213 -0
- data/features/sequences.feature +39 -8
- data/lib/genomer-plugin-summary/contigs.rb +63 -0
- data/lib/genomer-plugin-summary/enumerators.rb +81 -0
- data/lib/genomer-plugin-summary/format.rb +87 -0
- data/lib/genomer-plugin-summary/gaps.rb +25 -33
- data/lib/genomer-plugin-summary/genome.rb +51 -0
- data/lib/genomer-plugin-summary/metrics.rb +23 -9
- data/lib/genomer-plugin-summary/sequences.rb +44 -70
- data/spec/genomer-plugin-summary_spec/contigs_spec.rb +211 -0
- data/spec/genomer-plugin-summary_spec/enumerators_spec.rb +383 -0
- data/spec/genomer-plugin-summary_spec/format_spec.rb +285 -0
- data/spec/genomer-plugin-summary_spec/gaps_spec.rb +32 -7
- data/spec/genomer-plugin-summary_spec/{scaffold_spec.rb → genome_spec.rb} +26 -7
- data/spec/genomer-plugin-summary_spec/metrics_spec.rb +64 -0
- data/spec/genomer-plugin-summary_spec/sequences_spec.rb +52 -85
- data/spec/spec_helper.rb +1 -1
- metadata +20 -9
- data/features/scaffold.feature +0 -122
- data/lib/genomer-plugin-summary/scaffold.rb +0 -56
data/features/sequences.feature
CHANGED
@@ -25,10 +25,10 @@ Feature: Producing a summary of the scaffold sequences
|
|
25
25
|
+------------------+------------+------------+------------+----------+--------+
|
26
26
|
| Scaffold Sequences |
|
27
27
|
+------------------+------------+------------+------------+----------+--------+
|
28
|
-
|
|
28
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
29
29
|
+------------------+------------+------------+------------+----------+--------+
|
30
30
|
+------------------+------------+------------+------------+----------+--------+
|
31
|
-
| All |
|
31
|
+
| All | 0 | 0 | 0 | 0.00 | 0.00 |
|
32
32
|
+------------------+------------+------------+------------+----------+--------+
|
33
33
|
"""
|
34
34
|
|
@@ -53,7 +53,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
53
53
|
+------------------+------------+------------+------------+----------+--------+
|
54
54
|
| Scaffold Sequences |
|
55
55
|
+------------------+------------+------------+------------+----------+--------+
|
56
|
-
|
|
56
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
57
57
|
+------------------+------------+------------+------------+----------+--------+
|
58
58
|
| contig0001 | 1 | 4 | 4 | 100.00 | 50.00 |
|
59
59
|
+------------------+------------+------------+------------+----------+--------+
|
@@ -87,7 +87,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
87
87
|
+------------------+------------+------------+------------+----------+--------+
|
88
88
|
| Scaffold Sequences |
|
89
89
|
+------------------+------------+------------+------------+----------+--------+
|
90
|
-
|
|
90
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
91
91
|
+------------------+------------+------------+------------+----------+--------+
|
92
92
|
| contig0001 | 1 | 6 | 6 | 50.00 | 66.67 |
|
93
93
|
| contig0002 | 7 | 12 | 6 | 50.00 | 33.33 |
|
@@ -120,7 +120,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
120
120
|
+------------------+------------+------------+------------+----------+--------+
|
121
121
|
| Scaffold Sequences |
|
122
122
|
+------------------+------------+------------+------------+----------+--------+
|
123
|
-
|
|
123
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
124
124
|
+------------------+------------+------------+------------+----------+--------+
|
125
125
|
| contig0001 | 1 | 6 | 6 | 50.00 | 66.67 |
|
126
126
|
| contig0001 | 7 | 12 | 6 | 50.00 | 66.67 |
|
@@ -158,7 +158,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
158
158
|
+------------------+------------+------------+------------+----------+--------+
|
159
159
|
| Scaffold Sequences |
|
160
160
|
+------------------+------------+------------+------------+----------+--------+
|
161
|
-
|
|
161
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
162
162
|
+------------------+------------+------------+------------+----------+--------+
|
163
163
|
| contig0001 | 1 | 6 | 6 | 30.00 | 66.67 |
|
164
164
|
| contig0002 | 15 | 20 | 6 | 30.00 | 33.33 |
|
@@ -196,7 +196,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
196
196
|
+------------------+------------+------------+------------+----------+--------+
|
197
197
|
| Scaffold Sequences |
|
198
198
|
+------------------+------------+------------+------------+----------+--------+
|
199
|
-
|
|
199
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
200
200
|
+------------------+------------+------------+------------+----------+--------+
|
201
201
|
| contig0001 | 9 | 14 | 6 | 30.00 | 66.67 |
|
202
202
|
| contig0002 | 15 | 20 | 6 | 30.00 | 33.33 |
|
@@ -235,7 +235,7 @@ Feature: Producing a summary of the scaffold sequences
|
|
235
235
|
+------------------+------------+------------+------------+----------+--------+
|
236
236
|
| Scaffold Sequences |
|
237
237
|
+------------------+------------+------------+------------+----------+--------+
|
238
|
-
|
|
238
|
+
| Sequence | Start (bp) | End (bp) | Size (bp) | Size (%) | GC (%) |
|
239
239
|
+------------------+------------+------------+------------+----------+--------+
|
240
240
|
| contig0001 | 1 | 6 | 6 | 30.00 | 66.67 |
|
241
241
|
| contig0002 | 7 | 12 | 6 | 30.00 | 33.33 |
|
@@ -244,3 +244,34 @@ Feature: Producing a summary of the scaffold sequences
|
|
244
244
|
+------------------+------------+------------+------------+----------+--------+
|
245
245
|
"""
|
246
246
|
|
247
|
+
Scenario: Generating CSV output
|
248
|
+
Given I create a new genomer project
|
249
|
+
And I write to "assembly/scaffold.yml" with:
|
250
|
+
"""
|
251
|
+
---
|
252
|
+
-
|
253
|
+
sequence:
|
254
|
+
source: contig0001
|
255
|
+
-
|
256
|
+
sequence:
|
257
|
+
source: contig0002
|
258
|
+
-
|
259
|
+
unresolved:
|
260
|
+
length: 8
|
261
|
+
"""
|
262
|
+
And I write to "assembly/sequence.fna" with:
|
263
|
+
"""
|
264
|
+
>contig0001
|
265
|
+
ATGCGC
|
266
|
+
>contig0002
|
267
|
+
ATATGC
|
268
|
+
"""
|
269
|
+
When I run `genomer summary sequences --output=csv`
|
270
|
+
Then the exit status should be 0
|
271
|
+
And the output should contain:
|
272
|
+
"""
|
273
|
+
sequence,start_bp,end_bp,size_bp,size_%,gc_%
|
274
|
+
contig0001,1,6,6,30.00,66.67
|
275
|
+
contig0002,7,12,6,30.00,33.33
|
276
|
+
all,1,12,12,60.00,50.00
|
277
|
+
"""
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/format'
|
3
|
+
require 'genomer-plugin-summary/enumerators'
|
4
|
+
require 'genomer-plugin-summary/metrics'
|
5
|
+
|
6
|
+
class GenomerPluginSummary::Contigs < Genomer::Plugin
|
7
|
+
include GenomerPluginSummary::Metrics
|
8
|
+
include GenomerPluginSummary::Format
|
9
|
+
include GenomerPluginSummary::Enumerators
|
10
|
+
|
11
|
+
FORMATTING = {
|
12
|
+
:title => 'Scaffold Contigs',
|
13
|
+
:headers => ['Contig', 'Start (bp)', 'End (bp)', 'Size (bp)', 'Size (%)', 'GC (%)'],
|
14
|
+
:width => {
|
15
|
+
0 => 6,
|
16
|
+
1 => 10,
|
17
|
+
2 => 10,
|
18
|
+
3 => 10,
|
19
|
+
4 => 8,
|
20
|
+
5 => 6
|
21
|
+
},
|
22
|
+
:justification => {
|
23
|
+
0 => :right,
|
24
|
+
1 => :right,
|
25
|
+
2 => :right,
|
26
|
+
3 => :right,
|
27
|
+
4 => :right,
|
28
|
+
5 => :right
|
29
|
+
},
|
30
|
+
:format => {
|
31
|
+
4 => '%#.2f',
|
32
|
+
5 => '%#.2f'
|
33
|
+
}
|
34
|
+
}
|
35
|
+
COLUMNS = [:id, :start, :stop, :size, :percent, :gc]
|
36
|
+
|
37
|
+
def run
|
38
|
+
contigs = calculate(scaffold)
|
39
|
+
total = sequence_total(contigs)
|
40
|
+
|
41
|
+
tabulate(contigs,total,flags)
|
42
|
+
end
|
43
|
+
|
44
|
+
def tabulate(contigs,total,flags)
|
45
|
+
rows = contigs.map{|contig| COLUMNS.map{|col| contig[col]}}.
|
46
|
+
<<(:separator).
|
47
|
+
<<(COLUMNS.map{|col| total[col] || 'All'})
|
48
|
+
|
49
|
+
FORMATTING[:output] = flags[:output]
|
50
|
+
table(rows,FORMATTING)
|
51
|
+
end
|
52
|
+
|
53
|
+
def calculate(scaffold)
|
54
|
+
total_length = scaffold.mapping(&:sequence).mapping(&:length).inject(&:+).to_f
|
55
|
+
enumerator_for_contig(scaffold).
|
56
|
+
mapping{|i| i[:gc] = gc(i[:sequence]) / atgc(i[:sequence]) * 100; i}.
|
57
|
+
mapping{|i| i[:size] = i[:sequence].length; i}.
|
58
|
+
mapping{|i| i[:percent] = i[:size] / total_length * 100; i}.
|
59
|
+
mapping{|i| i.delete(:sequence); i}.
|
60
|
+
to_a
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'lazing'
|
2
|
+
|
3
|
+
module GenomerPluginSummary::Enumerators
|
4
|
+
|
5
|
+
def enumerator_for(type,scaffold)
|
6
|
+
send('enumerator_for_' + type.to_s, scaffold)
|
7
|
+
end
|
8
|
+
|
9
|
+
def enumerator_for_sequence(scaffold)
|
10
|
+
enumerator_for_all(scaffold).
|
11
|
+
selecting{|i| i[:type] == :sequence}
|
12
|
+
end
|
13
|
+
|
14
|
+
def enumerator_for_unresolved(scaffold)
|
15
|
+
enumerator_for_all(scaffold).
|
16
|
+
selecting{|i| i[:type] == :unresolved}
|
17
|
+
end
|
18
|
+
|
19
|
+
def enumerator_for_contig(scaffold)
|
20
|
+
genome = scaffold.mapping(&:sequence).to_a.join
|
21
|
+
regions = genome.
|
22
|
+
gsub(/([^Nn])([Nn])/,'\1 \2').
|
23
|
+
gsub(/([Nn])([^Nn])/,'\1 \2').
|
24
|
+
scan(/[^\s]+/)
|
25
|
+
|
26
|
+
regions.inject([0,1,[]]) do |memo,entry|
|
27
|
+
position, number, entries = memo
|
28
|
+
|
29
|
+
if entry.downcase.include? 'n'
|
30
|
+
next [position + entry.length, number, entries]
|
31
|
+
end
|
32
|
+
|
33
|
+
i = {:sequence => entry,
|
34
|
+
:start => position + 1,
|
35
|
+
:stop => position + entry.length,
|
36
|
+
:type => :contig,
|
37
|
+
:id => number}
|
38
|
+
|
39
|
+
[position + entry.length, number + 1, entries << i]
|
40
|
+
end.last
|
41
|
+
end
|
42
|
+
|
43
|
+
def enumerator_for_gap(scaffold)
|
44
|
+
genome = scaffold.mapping(&:sequence).to_a.join
|
45
|
+
regions = genome.
|
46
|
+
gsub(/([^Nn])([Nn])/,'\1 \2').
|
47
|
+
gsub(/([Nn])([^Nn])/,'\1 \2').
|
48
|
+
scan(/[^\s]+/)
|
49
|
+
|
50
|
+
regions.inject([0,1,[]]) do |memo,entry|
|
51
|
+
position, number, entries = memo
|
52
|
+
|
53
|
+
unless entry.downcase.include? 'n'
|
54
|
+
next [position + entry.length, number, entries]
|
55
|
+
end
|
56
|
+
|
57
|
+
i = {:sequence => entry,
|
58
|
+
:start => position + 1,
|
59
|
+
:stop => position + entry.length,
|
60
|
+
:type => :gap,
|
61
|
+
:id => number}
|
62
|
+
|
63
|
+
[position + entry.length, number + 1, entries << i]
|
64
|
+
end.last
|
65
|
+
end
|
66
|
+
|
67
|
+
def enumerator_for_all(scaffold)
|
68
|
+
scaffold.inject([0,[]]) do |memo,entry|
|
69
|
+
position, entries = memo
|
70
|
+
|
71
|
+
i = {:sequence => entry.sequence,
|
72
|
+
:start => position + 1,
|
73
|
+
:stop => position + entry.sequence.length,
|
74
|
+
:type => entry.entry_type,
|
75
|
+
:id => entry.entry_type == :sequence ? entry.source : nil}
|
76
|
+
|
77
|
+
[position + entry.sequence.length, entries << i]
|
78
|
+
end.last
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
require 'lazing'
|
3
|
+
|
4
|
+
module GenomerPluginSummary::Format
|
5
|
+
|
6
|
+
DEFAULTS = {
|
7
|
+
:justification => [],
|
8
|
+
:width => {},
|
9
|
+
:format => {}
|
10
|
+
}
|
11
|
+
|
12
|
+
def table(data,opts = {})
|
13
|
+
opts = DEFAULTS.merge opts
|
14
|
+
case opts[:output]
|
15
|
+
when 'csv' then csv(data,opts)
|
16
|
+
else pretty(data,opts)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def create_cells(data,opts)
|
21
|
+
data.map do |row|
|
22
|
+
if row == :separator
|
23
|
+
:separator
|
24
|
+
else
|
25
|
+
row.each_with_index.map do |cell,index|
|
26
|
+
format_cell(cell,
|
27
|
+
opts[:width][index],
|
28
|
+
opts[:justification][index],
|
29
|
+
opts[:format][index])
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def format_cell(cell,width,justification,format = nil)
|
36
|
+
formatted = case format
|
37
|
+
when String then sprintf(format,cell)
|
38
|
+
when Proc then format.call(cell).to_s
|
39
|
+
when nil then cell.to_s
|
40
|
+
end
|
41
|
+
|
42
|
+
return formatted if width.nil?
|
43
|
+
|
44
|
+
case justification
|
45
|
+
when :right then formatted.rjust(width)
|
46
|
+
when :center then formatted.center(width)
|
47
|
+
else formatted.ljust(width)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def csv(data,opts)
|
52
|
+
opts[:width] = {}
|
53
|
+
opts[:justification] = {}
|
54
|
+
|
55
|
+
cells = create_cells(data,opts)
|
56
|
+
|
57
|
+
cells.unshift opts[:headers] if opts[:headers]
|
58
|
+
|
59
|
+
cells.compact.
|
60
|
+
rejecting{|i| i == :separator}.
|
61
|
+
mapping{|i| i.join(',')}.
|
62
|
+
mapping{|i| i.gsub(' ','_')}.
|
63
|
+
mapping{|i| i.gsub(/[()]/,'')}.
|
64
|
+
mapping{|i| i.downcase}.
|
65
|
+
to_a. join("\n") + "\n"
|
66
|
+
end
|
67
|
+
|
68
|
+
def pretty(data,opts)
|
69
|
+
cells = create_cells(data,opts)
|
70
|
+
|
71
|
+
if opts[:headers]
|
72
|
+
cells.unshift :separator
|
73
|
+
cells.unshift(opts[:headers].each_with_index.map do |header,index|
|
74
|
+
width = opts[:width][index] || cells.mapping{|c| c[index].length }.max
|
75
|
+
format_cell(header, width, :center)
|
76
|
+
end)
|
77
|
+
end
|
78
|
+
|
79
|
+
table = Terminal::Table.new do |t|
|
80
|
+
cells.each{|c| t << c}
|
81
|
+
end
|
82
|
+
opts[:justification].each{|(k,v)| table.align_column k, v }
|
83
|
+
table.title ||= opts[:title]
|
84
|
+
table.to_s + "\n"
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
@@ -1,45 +1,37 @@
|
|
1
1
|
require 'genomer'
|
2
|
-
require '
|
2
|
+
require 'genomer-plugin-summary/format'
|
3
3
|
|
4
4
|
class GenomerPluginSummary::Gaps < Genomer::Plugin
|
5
|
+
include GenomerPluginSummary::Format
|
5
6
|
|
6
7
|
def run
|
7
|
-
tabulate
|
8
|
+
tabulate(determine_gaps(scaffold),flags)
|
8
9
|
end
|
9
10
|
|
10
|
-
|
11
|
-
['Number'.center(8),
|
12
|
-
'Length'.center(8),
|
13
|
-
'Start'.center(8),
|
14
|
-
'End'.center(8),
|
15
|
-
'Type'.center(12)]
|
16
|
-
end
|
17
|
-
|
18
|
-
def title
|
19
|
-
'Scaffold Gaps'
|
20
|
-
end
|
21
|
-
|
22
|
-
def tabulate(contigs)
|
23
|
-
table = Terminal::Table.new(:title => title) do |t|
|
24
|
-
t << headings
|
25
|
-
t << :separator
|
26
|
-
contigs.each do |ctg|
|
27
|
-
t << [ctg[:number],
|
28
|
-
ctg[:length],
|
29
|
-
ctg[:start],
|
30
|
-
ctg[:end],
|
31
|
-
ctg[:type]]
|
32
|
-
end
|
33
|
-
end
|
11
|
+
COLUMNS = [:number, :length, :start, :end, :type]
|
34
12
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
13
|
+
FORMATTING = {
|
14
|
+
:title => 'Scaffold Gaps',
|
15
|
+
:headers => ['Number', 'Length', 'Start', 'End', 'Type'],
|
16
|
+
:width => {
|
17
|
+
0 => 8,
|
18
|
+
1 => 8,
|
19
|
+
2 => 8,
|
20
|
+
3 => 8,
|
21
|
+
4 => 12
|
22
|
+
},
|
23
|
+
:justification => {
|
24
|
+
0 => :right,
|
25
|
+
1 => :right,
|
26
|
+
2 => :right,
|
27
|
+
3 => :right,
|
28
|
+
4 => :center
|
29
|
+
}
|
30
|
+
}
|
41
31
|
|
42
|
-
|
32
|
+
def tabulate(gaps,flags)
|
33
|
+
FORMATTING[:output] = flags[:output]
|
34
|
+
table(gaps.map{|gap| COLUMNS.map{|col| gap[col]}},FORMATTING)
|
43
35
|
end
|
44
36
|
|
45
37
|
def gap_locations(seq)
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/metrics'
|
3
|
+
require 'genomer-plugin-summary/format'
|
4
|
+
|
5
|
+
class GenomerPluginSummary::Genome < Genomer::Plugin
|
6
|
+
include GenomerPluginSummary::Metrics
|
7
|
+
include GenomerPluginSummary::Format
|
8
|
+
|
9
|
+
LAYOUT = [
|
10
|
+
{:name => 'Sequences (#)', :entry_type => :sequence, :method => :count},
|
11
|
+
{:name => 'Contigs (#)', :entry_type => :contig, :method => :count},
|
12
|
+
{:name => 'Gaps (#)', :entry_type => :gap, :method => :count},
|
13
|
+
:separator,
|
14
|
+
{:name => 'Size (bp)', :entry_type => :all, :method => :length},
|
15
|
+
{:name => 'Sequences (bp)', :entry_type => :sequence, :method => :length},
|
16
|
+
{:name => 'Contigs (bp)', :entry_type => :contig, :method => :length},
|
17
|
+
{:name => 'Gaps (bp)', :entry_type => :gap, :method => :length},
|
18
|
+
:separator,
|
19
|
+
{:name => 'G+C (%)', :entry_type => :all, :method => :gc_content},
|
20
|
+
{:name => 'Sequences (%)', :entry_type => :sequence, :method => :percent},
|
21
|
+
{:name => 'Contigs (%)', :entry_type => :contig, :method => :percent},
|
22
|
+
{:name => 'Gaps (%)', :entry_type => :gap, :method => :percent}
|
23
|
+
]
|
24
|
+
|
25
|
+
FORMATTING = {
|
26
|
+
:title => 'Scaffold',
|
27
|
+
:width => {0 => 12, 1 => 9},
|
28
|
+
:justification => {1 => :right},
|
29
|
+
:format => {1 => lambda{|i| i.class == Float ? sprintf('%#.2f',i) : i }}
|
30
|
+
}
|
31
|
+
|
32
|
+
def run
|
33
|
+
tabulate(calculate_metrics(LAYOUT, scaffold),flags)
|
34
|
+
end
|
35
|
+
|
36
|
+
def tabulate(data,flags)
|
37
|
+
FORMATTING.store(:output,flags[:output]) if flags[:output]
|
38
|
+
table(data,FORMATTING)
|
39
|
+
end
|
40
|
+
|
41
|
+
def calculate_metrics(specs,scaffold)
|
42
|
+
specs.map do |spec|
|
43
|
+
if spec == :separator
|
44
|
+
spec
|
45
|
+
else
|
46
|
+
[spec[:name], send(spec[:method],spec[:entry_type],scaffold)]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
@@ -1,13 +1,15 @@
|
|
1
1
|
require 'genomer'
|
2
|
+
require 'genomer-plugin-summary/enumerators'
|
2
3
|
require 'lazing'
|
3
4
|
|
4
5
|
module GenomerPluginSummary::Metrics
|
6
|
+
include GenomerPluginSummary::Enumerators
|
5
7
|
|
6
8
|
ALL = :all
|
7
9
|
|
8
10
|
def gc_content(type,scfd)
|
9
|
-
gc = enumerator_for(type,scfd).mapping{|i| gc(i)}.inject(:+) || 0.0
|
10
|
-
atgc = enumerator_for(type,scfd).mapping{|i| atgc(i)}.inject(:+) || 0.0
|
11
|
+
gc = enumerator_for(type,scfd).mapping{|i| gc(i[:sequence])}.inject(:+) || 0.0
|
12
|
+
atgc = enumerator_for(type,scfd).mapping{|i| atgc(i[:sequence])}.inject(:+) || 0.0
|
11
13
|
gc / atgc * 100
|
12
14
|
end
|
13
15
|
|
@@ -21,21 +23,33 @@ module GenomerPluginSummary::Metrics
|
|
21
23
|
|
22
24
|
def length(type,scfd)
|
23
25
|
enumerator_for(type,scfd).
|
24
|
-
mapping
|
26
|
+
mapping{|i| i[:sequence]}.
|
25
27
|
mapping(&:length).
|
26
28
|
inject(:+) || 0
|
27
29
|
end
|
28
30
|
|
29
|
-
def gc(
|
30
|
-
|
31
|
+
def gc(sequence)
|
32
|
+
sequence.gsub(/[^GCgc]/,'').length.to_f
|
31
33
|
end
|
32
34
|
|
33
|
-
def atgc(
|
34
|
-
|
35
|
+
def atgc(sequence)
|
36
|
+
sequence.gsub(/[^ATGCatgc]/,'').length.to_f
|
35
37
|
end
|
36
38
|
|
37
|
-
def
|
38
|
-
|
39
|
+
def sequence_total(seqs)
|
40
|
+
return Hash[[:start, :stop, :size, :percent, :gc].map{|i| [i, 0]}] if seqs.empty?
|
41
|
+
|
42
|
+
totals = seqs.inject({:size => 0, :percent => 0, :gc => 0}) do |hash,entry|
|
43
|
+
hash[:start] ||= entry[:start]
|
44
|
+
hash[:stop] = entry[:stop]
|
45
|
+
hash[:size] += entry[:size]
|
46
|
+
hash[:percent] += entry[:percent]
|
47
|
+
hash[:gc] += entry[:gc] * entry[:size]
|
48
|
+
|
49
|
+
hash
|
50
|
+
end
|
51
|
+
totals[:gc] /= totals[:size]
|
52
|
+
totals
|
39
53
|
end
|
40
54
|
|
41
55
|
end
|
@@ -1,92 +1,66 @@
|
|
1
1
|
require 'genomer'
|
2
2
|
require 'genomer-plugin-summary/metrics'
|
3
|
-
require '
|
3
|
+
require 'genomer-plugin-summary/format'
|
4
4
|
|
5
5
|
class GenomerPluginSummary::Sequences < Genomer::Plugin
|
6
6
|
include GenomerPluginSummary::Metrics
|
7
|
+
include GenomerPluginSummary::Format
|
8
|
+
include GenomerPluginSummary::Enumerators
|
7
9
|
|
8
10
|
def run
|
9
11
|
sequences = calculate(scaffold)
|
10
|
-
total =
|
12
|
+
total = sequence_total(sequences)
|
11
13
|
|
12
|
-
tabulate(sequences,total)
|
14
|
+
tabulate(sequences,total,flags)
|
13
15
|
end
|
14
16
|
|
15
|
-
|
16
|
-
['Sequence'.left(16),
|
17
|
-
'Start (bp)'.center(10),
|
18
|
-
'End (bp)'.center(10),
|
19
|
-
'Size (bp)'.center(10),
|
20
|
-
'Size (%)'.center(8),
|
21
|
-
'GC (%)'.center(6)]
|
22
|
-
end
|
23
|
-
|
24
|
-
def title
|
25
|
-
'Scaffold Sequences'
|
26
|
-
end
|
27
|
-
|
28
|
-
def tabulate(rows,total)
|
29
|
-
table = Terminal::Table.new(:title => title) do |t|
|
30
|
-
t << headings
|
31
|
-
t << :separator
|
32
|
-
rows.each do |row|
|
33
|
-
t << table_array(row)
|
34
|
-
end
|
35
|
-
t << :separator
|
36
|
-
t << table_array(total.merge({:sequence => 'All'}))
|
37
|
-
end
|
17
|
+
COLUMNS = [:id, :start, :stop, :size, :percent, :gc]
|
38
18
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
19
|
+
FORMATTING = {
|
20
|
+
:title => 'Scaffold Sequences',
|
21
|
+
:headers => ['Sequence', 'Start (bp)', 'End (bp)', 'Size (bp)', 'Size (%)', 'GC (%)'],
|
22
|
+
:width => {
|
23
|
+
0 => 16,
|
24
|
+
1 => 10,
|
25
|
+
2 => 10,
|
26
|
+
3 => 10,
|
27
|
+
4 => 8,
|
28
|
+
5 => 6
|
29
|
+
},
|
30
|
+
:justification => {
|
31
|
+
0 => :left,
|
32
|
+
1 => :right,
|
33
|
+
2 => :right,
|
34
|
+
3 => :right,
|
35
|
+
4 => :right,
|
36
|
+
5 => :right
|
37
|
+
},
|
38
|
+
:format => {
|
39
|
+
4 => '%#.2f',
|
40
|
+
5 => '%#.2f'
|
41
|
+
}
|
42
|
+
}
|
45
43
|
|
46
|
-
|
47
|
-
|
44
|
+
def tabulate(sequences,total,flags)
|
45
|
+
rows = sequences.map{|sequence| COLUMNS.map{|col| sequence[col]}}.
|
46
|
+
<<(:separator).
|
47
|
+
<<(COLUMNS.map{|col| total[col] || 'All'})
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
map{|i| hash[i]}.
|
52
|
-
map{|i| i.class == Float ? sprintf('%#.2f',i) : i }
|
49
|
+
FORMATTING[:output] = flags[:output]
|
50
|
+
table(rows,FORMATTING)
|
53
51
|
end
|
54
52
|
|
55
53
|
def calculate(scaffold)
|
56
|
-
total_length = length(
|
57
|
-
running_length = 0
|
58
|
-
|
59
|
-
scaffold.map do |entry|
|
60
|
-
i = nil
|
61
|
-
if entry.entry_type != :unresolved
|
62
|
-
entry_length = entry.sequence.length
|
63
|
-
i = { :sequence => entry.source,
|
64
|
-
:start => running_length + 1,
|
65
|
-
:end => running_length + entry_length,
|
66
|
-
:size => entry_length,
|
67
|
-
:percent => entry_length / total_length * 100,
|
68
|
-
:gc => gc(entry) / atgc(entry) * 100 }
|
69
|
-
end
|
70
|
-
|
71
|
-
running_length += entry.sequence.length
|
72
|
-
i
|
73
|
-
end.compact
|
74
|
-
end
|
75
|
-
|
76
|
-
def total(seqs)
|
77
|
-
return Hash[[:start, :end, :size, :percent, :gc].map{|i| [i, 'NA']}] if seqs.empty?
|
54
|
+
total_length = scaffold.mapping(&:sequence).mapping(&:length).inject(&:+).to_f
|
78
55
|
|
79
|
-
|
80
|
-
|
81
|
-
hash[:end] = entry[:end]
|
82
|
-
hash[:size] += entry[:size]
|
83
|
-
hash[:percent] += entry[:percent]
|
84
|
-
hash[:gc] += entry[:gc] * entry[:size]
|
56
|
+
enumerator_for(:sequence,scaffold).mapping do |entry|
|
57
|
+
sequence = entry.delete(:sequence)
|
85
58
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
59
|
+
entry[:size] = sequence.length
|
60
|
+
entry[:gc] = gc(sequence) / atgc(sequence) * 100
|
61
|
+
entry[:percent] = sequence.length / total_length * 100
|
62
|
+
entry
|
63
|
+
end.to_a
|
90
64
|
end
|
91
65
|
|
92
66
|
end
|