cheripic 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog.md +10 -3
- data/README.md +20 -10
- data/Rakefile +2 -2
- data/galaxy_cheripic_tool.xml +47 -19
- data/lib/cheripic/cmd.rb +40 -14
- data/lib/cheripic/implementer.rb +5 -2
- data/lib/cheripic/options.rb +5 -0
- data/lib/cheripic/variants.rb +84 -77
- data/lib/cheripic/vcf.rb +32 -8
- data/lib/cheripic/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4b848b1b52f01215fabf404f7387e6f621a2285b
|
|
4
|
+
data.tar.gz: e434cbad53ac982974c30cf2a940668862a6d2bd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ef7eb0c0008b8d8ed45d6e1deeeb4c59ef187a8af3a92fbc91dba33d91af03b475c337c2e69dcdcb8c34c3cfeea3f9f5d54e42fadba8d0270e0a776c96609c79
|
|
7
|
+
data.tar.gz: beb3fccf28f54a30fc355a93261043d27dfd4d2ebb34043245404608d1608d2185aae57b8263192ca5973caf1a99c9b9b1614187e5d253656efc9ec9fab9aa50
|
data/ChangeLog.md
CHANGED
|
@@ -6,8 +6,15 @@ All significant changes to this project at each release are documented in this f
|
|
|
6
6
|
#### Future changes to include
|
|
7
7
|
|
|
8
8
|
1. option to take multiple background pileup files
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
|
|
10
|
+
#### [1.2.6] - 2016-10-26
|
|
11
|
+
|
|
12
|
+
1. option to run using only vcf files as bulk inputs to increase speed of analysis for larger genomes
|
|
13
|
+
|
|
14
|
+
#### [1.2.5] - 2016-10-17
|
|
15
|
+
|
|
16
|
+
1. Updated methods take bam file (along with a vcf file) or pileup file as inputs of bulks
|
|
17
|
+
2. Replaced output directory with output file name tag, since we only write to one file
|
|
11
18
|
|
|
12
19
|
#### [1.2.0] - 2016-08-11
|
|
13
20
|
|
|
@@ -18,4 +25,4 @@ All significant changes to this project at each release are documented in this f
|
|
|
18
25
|
|
|
19
26
|
#### [1.1.0] - 2016-07-26
|
|
20
27
|
|
|
21
|
-
first release of the binaries for Linux 64 bit and OSX 64bit
|
|
28
|
+
first release of the binaries for Linux 64 bit and OSX 64bit
|
data/README.md
CHANGED
|
@@ -45,7 +45,7 @@ Running `cheripic` without any input at command line interface shows following h
|
|
|
45
45
|
|
|
46
46
|
```
|
|
47
47
|
|
|
48
|
-
Cheripic v1.2.
|
|
48
|
+
Cheripic v1.2.6
|
|
49
49
|
Authors: Shyam Rallapalli and Dan MacLean
|
|
50
50
|
|
|
51
51
|
Description: Candidate mutation and closely linked marker selection for non reference genomes
|
|
@@ -53,35 +53,45 @@ Uses bulk segregant data from non-reference sequence genomes
|
|
|
53
53
|
|
|
54
54
|
Inputs:
|
|
55
55
|
1. Needs a reference fasta file of the assembly used for variant analysis
|
|
56
|
-
2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
|
|
57
|
-
3. If
|
|
56
|
+
2. Pileup/Bam files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
|
|
57
|
+
3. If providing bam files, you have to include vcf files for the respective bulks
|
|
58
|
+
4. If polyploid species, include pileup/bam files from one or both parents
|
|
58
59
|
|
|
59
60
|
USAGE:
|
|
60
61
|
cheripic <options>
|
|
61
62
|
|
|
62
63
|
OPTIONS:
|
|
63
64
|
-f, --assembly=<s> Assembly file in FASTA format
|
|
64
|
-
-F, --input-format=<s> bulk and parent alignment file format types - set either pileup or bam (default: pileup)
|
|
65
|
+
-F, --input-format=<s> bulk and parent alignment file format types - set either pileup or bam or vcf (default: pileup)
|
|
65
66
|
-a, --mut-bulk=<s> Pileup or sorted BAM file alignments from mutant/trait of interest bulk 1
|
|
67
|
+
--mut-bulk-vcf=<s> vcf file for variants from mutant/trait of interest bulk 1 (default: )
|
|
66
68
|
-b, --bg-bulk=<s> Pileup or sorted BAM file alignments from background/wildtype bulk 2
|
|
67
|
-
--
|
|
69
|
+
--bg-bulk-vcf=<s> vcf file for variants from background/wildtype bulk 2 (default: )
|
|
70
|
+
--output=<s> custom name tag to include in the output file name (default: cheripic_results)
|
|
68
71
|
--loglevel=<s> Choose any one of "info / warn / debug" level for logs generated (default: debug)
|
|
69
72
|
--hmes-adjust=<f> factor added to snp count of each contig to adjust for hme score calculations (default: 0.5)
|
|
70
73
|
--htlow=<f> lower level for categorizing heterozygosity (default: 0.2)
|
|
71
74
|
--hthigh=<f> high level for categorizing heterozygosity (default: 0.9)
|
|
72
|
-
--mindepth=<i> minimum read depth
|
|
75
|
+
--mindepth=<i> minimum read depth at a position to consider for variant calls (default: 6)
|
|
76
|
+
--max-d-multiple=<i> multiplication factor for average coverage to calculate maximum read coverage
|
|
77
|
+
if set zero no calculation will be made from bam file.
|
|
78
|
+
setting this value will override user set max depth (Default: 5)
|
|
79
|
+
--maxdepth=<i> maximum read depth at a position to consider for variant calls
|
|
80
|
+
if set to zero no user max depth will be used (default: 0)
|
|
73
81
|
--min-non-ref-count=<i> minimum read depth supporting non reference base at each position (default: 3)
|
|
74
82
|
--min-indel-count-support=<i> minimum read depth supporting an indel at each position (default: 3)
|
|
75
|
-
--ambiguous-ref-bases
|
|
83
|
+
--ambiguous-ref-bases=<s> including variant at completely ambiguous bases in the reference (default: false)
|
|
76
84
|
-q, --mapping-quality=<i> minimum mapping quality of read covering the position (default: 20)
|
|
77
85
|
-Q, --base-quality=<i> minimum base quality of bases covering the position (default: 15)
|
|
78
86
|
--noise=<f> proportion of reads for a variant to consider as noise (default: 0.1)
|
|
79
87
|
--cross-type=<s> type of cross used to generated mapping population - back or out (default: back)
|
|
80
|
-
--use-all-contigs
|
|
81
|
-
--include-low-hmes
|
|
82
|
-
|
|
88
|
+
--use-all-contigs=<s> option to select all contigs or only contigs containing variants for analysis (default: false)
|
|
89
|
+
--include-low-hmes=<s> option to include or discard variants from contigs with
|
|
90
|
+
low hme-score or bfr score to list in the final output (default: false)
|
|
91
|
+
--polyploidy=<s> Set if the data input is from polyploids (default: false)
|
|
83
92
|
-p, --mut-parent=<s> Pileup or sorted BAM file alignments from mutant/trait of interest parent (default: )
|
|
84
93
|
-r, --bg-parent=<s> Pileup or sorted BAM file alignments from background/wildtype parent (default: )
|
|
94
|
+
-R, --repeats-file=<s> repeat masker output file for the assembly (default: )
|
|
85
95
|
--bfr-adjust=<f> factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)
|
|
86
96
|
--sel-seq-len=<i> sequence length to print from either side of selected variants (default: 50)
|
|
87
97
|
--examples shows some example commands with explanation
|
data/Rakefile
CHANGED
|
@@ -61,8 +61,8 @@ def create_package(target)
|
|
|
61
61
|
package_dir = "packaging/#{package_dest}"
|
|
62
62
|
sh "rm -rf #{package_dir}"
|
|
63
63
|
sh "mkdir #{package_dir}"
|
|
64
|
-
sh "mkdir -p #{package_dir}/lib/app"
|
|
65
|
-
sh "cp
|
|
64
|
+
sh "mkdir -p #{package_dir}/lib/app/bin"
|
|
65
|
+
sh "cp bin/cheripic #{package_dir}/lib/app/bin/"
|
|
66
66
|
sh "cp -R lib #{package_dir}/lib/app/"
|
|
67
67
|
sh "mkdir #{package_dir}/lib/app/ruby"
|
|
68
68
|
sh "tar -xzf packaging/traveling-ruby-#{TRAVELING_RUBY_VERSION}-#{target}.tar.gz -C #{package_dir}/lib/app/ruby"
|
data/galaxy_cheripic_tool.xml
CHANGED
|
@@ -1,21 +1,25 @@
|
|
|
1
|
-
<tool id="cheripic" name="CHERIPIC" version="1.2.
|
|
1
|
+
<tool id="cheripic" name="CHERIPIC" version="1.2.6">
|
|
2
2
|
|
|
3
3
|
<description>CHERIPIC</description>
|
|
4
4
|
|
|
5
|
-
<version_command
|
|
5
|
+
<version_command>/full_path_to/cheripic -v</version_command>
|
|
6
6
|
|
|
7
7
|
<command>
|
|
8
8
|
<![CDATA[
|
|
9
|
-
cheripic
|
|
9
|
+
/full_path_to/cheripic
|
|
10
10
|
--assembly $assembly
|
|
11
|
+
--input-format $input_format
|
|
11
12
|
--mut-bulk $mut_bulk
|
|
12
13
|
--bg-bulk $bg_bulk
|
|
13
|
-
--
|
|
14
|
+
--mut-bulk-vcf $mut_bulk_vcf
|
|
15
|
+
--bg-bulk-vcf $bg_bulk_vcf
|
|
14
16
|
--loglevel $loglevel
|
|
15
17
|
--hmes-adjust $hmes_adjust
|
|
16
18
|
--htlow $ht_low
|
|
17
19
|
--hthigh $ht_high
|
|
18
20
|
--mindepth $min_depth
|
|
21
|
+
--max-d-multiple $max_d_multiple
|
|
22
|
+
--maxdepth $max_depth
|
|
19
23
|
--min-non-ref-count $min_non_ref_count
|
|
20
24
|
--min-indel-count-support $min_indel_count_support
|
|
21
25
|
--ambiguous-ref-bases $ambiguous_ref_bases
|
|
@@ -28,17 +32,26 @@
|
|
|
28
32
|
--polyploidy $polyploidy
|
|
29
33
|
--mut-parent $mut_parent
|
|
30
34
|
--bg-parent $bg_parent
|
|
35
|
+
--repeats-file $repeats_file
|
|
31
36
|
--bfr-adjust $bfr_adjust
|
|
32
37
|
--sel-seq-len $sel_seq_len
|
|
38
|
+
&> output_log.txt
|
|
33
39
|
]]>
|
|
34
40
|
</command>
|
|
35
41
|
|
|
36
42
|
<inputs>
|
|
37
43
|
<param name="assembly" type="data" format="fasta" label="Input Assembly file" help="Select Assembly fasta file" />
|
|
38
|
-
<param name="
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
<option value="
|
|
44
|
+
<param name="input_format" type="select" optional="true" label="input file format" help="choose between vcf, bam and pileup format" >
|
|
45
|
+
<option value="vcf" selected="true">vcf</option>
|
|
46
|
+
<option value="bam">bam</option>
|
|
47
|
+
<option value="pileup">pileup</option>
|
|
48
|
+
</param>
|
|
49
|
+
<param name="mut_bulk" type="data" label="mutant bulk input file" help="Select mutant bulk input file" />
|
|
50
|
+
<param name="bg_bulk" type="data" label="background bulk input file" help="Select background bulk input file" />
|
|
51
|
+
<param name="mut_bulk_vcf" type="data" optional="true" label="mutant bulk input vcf file" help="Select mutant bulk input vcf file" />
|
|
52
|
+
<param name="bg_bulk_vcf" type="data" optional="true" label="background bulk input vcf file" help="Select background bulk input vcf file" />
|
|
53
|
+
<param name="loglevel" type="select" optional="true" label="analysis log level" help="choose between info, warn and debug levels" >
|
|
54
|
+
<option value="info" selected="true">info</option>
|
|
42
55
|
<option value="warn">warnings</option>
|
|
43
56
|
<option value="debug">debug</option>
|
|
44
57
|
</param>
|
|
@@ -50,6 +63,10 @@
|
|
|
50
63
|
label="heterozygosity high limit" help="upper limit to heterozygosity allele fraction" />
|
|
51
64
|
<param name="min_depth" size="4" type="integer" optional="true" value="6" min="1" max="8000"
|
|
52
65
|
label="minimum read coverage" help="minimum read depth to consider a position for variant calls" />
|
|
66
|
+
<param name="max_d_multiple" size="4" type="integer" optional="true" value="5" min="0" max="100"
|
|
67
|
+
label="multiplication factor avg read coverage" help="multiplication factor for average coverage to calculate maximum read coverage" />
|
|
68
|
+
<param name="max_depth" size="4" type="integer" optional="true" value="0" min="0" max="8000"
|
|
69
|
+
label="maximum read coverage" help="maximum read depth to consider a position for variant calls" />
|
|
53
70
|
<param name="min_non_ref_count" size="4" type="integer" optional="true" value="3" min="1" max="8000"
|
|
54
71
|
label="minimum alternate read coverage" help="minimum read depth supporting non reference base at each position" />
|
|
55
72
|
<param name="min_indel_count_support" size="4" type="integer" optional="true" value="3" min="1" max="8000"
|
|
@@ -73,20 +90,22 @@
|
|
|
73
90
|
help="option to include or discard variants from contigs with low hme-score or bfr score to list in the final output" truevalue="true" falsevalue="false" />
|
|
74
91
|
<param name="polyploidy" type="boolean" optional="true" checked="false" label="polyploid data"
|
|
75
92
|
help="Set if the input data is from polyploids" truevalue="true" falsevalue="false" />
|
|
76
|
-
<param name="
|
|
77
|
-
<param name="
|
|
93
|
+
<param name="mut_parent" type="data" optional="true" format="pileup" label="mutant parent pileup file" help="Select mutant parent pileup file" />
|
|
94
|
+
<param name="bg_parent" type="data" optional="true" format="pileup" label="background parent pileup file" help="Select background parent pileup file" />
|
|
95
|
+
<param name="repeats_file" type="data" optional="true" format="txt" label="Repeat masker output file" help="Repeat masker output file of repeat positions" />
|
|
78
96
|
|
|
79
97
|
<param name="bfr_adjust" size="4" type="float" optional="true" value="0.05" min="0.01" max="1.0"
|
|
80
98
|
label="bfr score adjuster" help="factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)" />
|
|
81
99
|
<param name="sel_seq_len" size="4" type="integer" optional="true" value="50" min="10" max="250"
|
|
82
100
|
label="selected variant seq length out" help="sequence length to print from either side of selected variants (default: 50)" />
|
|
83
|
-
|
|
84
|
-
<param name="output" type="text" size="30" value="cheripic_results" label="tag for output filename" help="write a tag to include with output filename" />
|
|
85
101
|
</inputs>
|
|
86
102
|
|
|
87
103
|
<outputs>
|
|
88
|
-
<data name="
|
|
89
|
-
<data name="
|
|
104
|
+
<data name="output1" format="txt" from_work_dir="output_log.txt" label="cheripic log file" />
|
|
105
|
+
<data name="output2" format="txt" from_work_dir="cheripic_results_selected_hme_variants.txt" label="selected hmes variants" />
|
|
106
|
+
<data name="output3" format="txt" from_work_dir="cheripic_results_selected_bfr_variants.txt" label="selected bfr variants" >
|
|
107
|
+
<filter>polyploidy == "true"</filter>
|
|
108
|
+
</data>
|
|
90
109
|
</outputs>
|
|
91
110
|
|
|
92
111
|
<tests>
|
|
@@ -154,15 +173,22 @@ All of the options have a default value. You can change any of them. All of the
|
|
|
154
173
|
|
|
155
174
|
OPTIONS:
|
|
156
175
|
-f, --assembly Assembly file in FASTA format
|
|
157
|
-
-F, --input-format bulk and parent alignment file format types - set either pileup or bam (default: pileup)
|
|
176
|
+
-F, --input-format bulk and parent alignment file format types - set either pileup or bam or vcf (default: pileup)
|
|
158
177
|
-a, --mut-bulk Pileup or sorted BAM file alignments from mutant/trait of interest bulk 1
|
|
178
|
+
--mut-bulk-vcf vcf file for variants from mutant/trait of interest bulk 1
|
|
159
179
|
-b, --bg-bulk Pileup or sorted BAM file alignments from background/wildtype bulk 2
|
|
160
|
-
--
|
|
180
|
+
--bg-bulk-vcf vcf file for variants from background/wildtype bulk 2
|
|
181
|
+
--output custom name tag to include in the output file name (default: cheripic_results)
|
|
161
182
|
--loglevel Choose any one of "info / warn / debug" level for logs generated (default: debug)
|
|
162
183
|
--hmes-adjust factor added to snp count of each contig to adjust for hme score calculations (default: 0.5)
|
|
163
184
|
--htlow lower level for categorizing heterozygosity (default: 0.2)
|
|
164
185
|
--hthigh high level for categorizing heterozygosity (default: 0.9)
|
|
165
|
-
--mindepth minimum read depth
|
|
186
|
+
--mindepth minimum read depth at a position to consider for variant calls (default: 6)
|
|
187
|
+
--max-d-multiple multiplication factor for average coverage to calculate maximum read coverage
|
|
188
|
+
if set zero no calculation will be made from bam file.
|
|
189
|
+
setting this value will override user set max depth (Default: 5)
|
|
190
|
+
--maxdepth maximum read depth at a position to consider for variant calls
|
|
191
|
+
if set to zero no user max depth will be used (default: 0)
|
|
166
192
|
--min-non-ref-count minimum read depth supporting non reference base at each position (default: 3)
|
|
167
193
|
--min-indel-count-support minimum read depth supporting an indel at each position (default: 3)
|
|
168
194
|
--ambiguous-ref-bases including variant at completely ambiguous bases in the reference
|
|
@@ -171,10 +197,12 @@ OPTIONS:
|
|
|
171
197
|
--noise proportion of reads for a variant to consider as noise (default: 0.1)
|
|
172
198
|
--cross-type type of cross used to generated mapping population - back or out (default: back)
|
|
173
199
|
--use-all-contigs option to select all contigs or only contigs containing variants for analysis
|
|
174
|
-
--include-low-hmes option to include or discard variants from contigs with
|
|
200
|
+
--include-low-hmes option to include or discard variants from contigs with
|
|
201
|
+
low hme-score or bfr score to list in the final output
|
|
175
202
|
--polyploidy Set if the data input is from polyploids
|
|
176
203
|
-p, --mut-parent Pileup or sorted BAM file alignments from mutant/trait of interest parent (default: )
|
|
177
204
|
-r, --bg-parent Pileup or sorted BAM file alignments from background/wildtype parent (default: )
|
|
205
|
+
-R, --repeats-file repeat masker output file for the assembly (default: )
|
|
178
206
|
--bfr-adjust factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)
|
|
179
207
|
--sel-seq-len sequence length to print from either side of selected variants (default: 50)
|
|
180
208
|
|
|
@@ -190,7 +218,7 @@ Shyam Rallapalli
|
|
|
190
218
|
</help>
|
|
191
219
|
|
|
192
220
|
<citations>
|
|
193
|
-
<citation type="doi">
|
|
221
|
+
<citation type="doi">spaceholder</citation>
|
|
194
222
|
</citations>
|
|
195
223
|
|
|
196
224
|
</tool>
|
data/lib/cheripic/cmd.rb
CHANGED
|
@@ -45,7 +45,7 @@ module Cheripic
|
|
|
45
45
|
opt :assembly, 'Assembly file in FASTA format',
|
|
46
46
|
:short => '-f',
|
|
47
47
|
:type => String
|
|
48
|
-
opt :input_format, 'bulk and parent alignment file format types - set either pileup or bam',
|
|
48
|
+
opt :input_format, 'bulk and parent alignment file format types - set either pileup or bam or vcf',
|
|
49
49
|
:short => '-F',
|
|
50
50
|
:type => String,
|
|
51
51
|
:default => 'pileup'
|
|
@@ -53,16 +53,18 @@ module Cheripic
|
|
|
53
53
|
:short => '-a',
|
|
54
54
|
:type => String
|
|
55
55
|
opt :mut_bulk_vcf, 'vcf file for variants from mutant/trait of interest bulk 1',
|
|
56
|
-
:type => String
|
|
56
|
+
:type => String,
|
|
57
|
+
:default => ''
|
|
57
58
|
opt :bg_bulk, 'Pileup or sorted BAM file alignments from background/wildtype bulk 2',
|
|
58
59
|
:short => '-b',
|
|
59
60
|
:type => String
|
|
60
61
|
opt :bg_bulk_vcf, 'vcf file for variants from background/wildtype bulk 2',
|
|
61
|
-
:type => String
|
|
62
|
+
:type => String,
|
|
63
|
+
:default => ''
|
|
62
64
|
opt :output, 'custom name tag to include in the output file name',
|
|
63
65
|
:default => 'cheripic_results'
|
|
64
66
|
opt :loglevel, 'Choose any one of "info / warn / debug" level for logs generated',
|
|
65
|
-
:default => '
|
|
67
|
+
:default => 'info'
|
|
66
68
|
opt :hmes_adjust, 'factor added to snp count of each contig to adjust for hme score calculations',
|
|
67
69
|
:type => Float,
|
|
68
70
|
:default => 0.5
|
|
@@ -79,8 +81,8 @@ module Cheripic
|
|
|
79
81
|
if set zero no calculation will be made from bam file.\nsetting this value will override user set max depth",
|
|
80
82
|
:type => Integer,
|
|
81
83
|
:default => 5
|
|
82
|
-
opt :maxdepth,
|
|
83
|
-
if set to zero no user max depth will be used
|
|
84
|
+
opt :maxdepth, 'maximum read depth at a position to consider for variant calls
|
|
85
|
+
if set to zero no user max depth will be used',
|
|
84
86
|
:type => Integer,
|
|
85
87
|
:default => 0
|
|
86
88
|
opt :min_non_ref_count, 'minimum read depth supporting non reference base at each position',
|
|
@@ -90,8 +92,8 @@ if set to zero no user max depth will be used",
|
|
|
90
92
|
:type => Integer,
|
|
91
93
|
:default => 3
|
|
92
94
|
opt :ambiguous_ref_bases, 'including variant at completely ambiguous bases in the reference',
|
|
93
|
-
:type =>
|
|
94
|
-
:default => false
|
|
95
|
+
:type => String,
|
|
96
|
+
:default => 'false'
|
|
95
97
|
opt :mapping_quality, 'minimum mapping quality of read covering the position',
|
|
96
98
|
:short => '-q',
|
|
97
99
|
:type => Integer,
|
|
@@ -107,15 +109,15 @@ if set to zero no user max depth will be used",
|
|
|
107
109
|
:type => String,
|
|
108
110
|
:default => 'back'
|
|
109
111
|
opt :use_all_contigs, 'option to select all contigs or only contigs containing variants for analysis',
|
|
110
|
-
:type =>
|
|
111
|
-
:default => false
|
|
112
|
+
:type => String,
|
|
113
|
+
:default => 'false'
|
|
112
114
|
opt :include_low_hmes, 'option to include or discard variants from contigs with
|
|
113
115
|
low hme-score or bfr score to list in the final output',
|
|
114
|
-
:type =>
|
|
115
|
-
:default => false
|
|
116
|
+
:type => String,
|
|
117
|
+
:default => 'false'
|
|
116
118
|
opt :polyploidy, 'Set if the data input is from polyploids',
|
|
117
|
-
:type =>
|
|
118
|
-
:default => false
|
|
119
|
+
:type => String,
|
|
120
|
+
:default => 'false'
|
|
119
121
|
opt :mut_parent, 'Pileup or sorted BAM file alignments from mutant/trait of interest parent',
|
|
120
122
|
:short => '-p',
|
|
121
123
|
:type => String,
|
|
@@ -187,11 +189,35 @@ low hme-score or bfr score to list in the final output',
|
|
|
187
189
|
|
|
188
190
|
# calls other methods to check if command line inputs are valid
|
|
189
191
|
def check_arguments
|
|
192
|
+
convert_boolean_strings
|
|
190
193
|
check_output
|
|
191
194
|
check_log_level
|
|
195
|
+
check_input_entry
|
|
192
196
|
check_input_types
|
|
193
197
|
end
|
|
194
198
|
|
|
199
|
+
# convert true or false options to boolean
|
|
200
|
+
def convert_boolean_strings
|
|
201
|
+
%i{ambiguous_ref_bases use_all_contigs include_low_hmes polyploidy}.each do | symbol |
|
|
202
|
+
if @options.key?(symbol)
|
|
203
|
+
@options[symbol] = @options[symbol] == 'false' ? false : true
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
# reset file options passed as 'None' to '' and remove their corresponding _given flag
|
|
209
|
+
def check_input_entry
|
|
210
|
+
%i{assembly mut_bulk bg_bulk mut_bulk_vcf bg_bulk_vcf mut_parent bg_parent repeats_file}.each do | symbol |
|
|
211
|
+
if @options.key?(symbol)
|
|
212
|
+
if @options[symbol] == 'None'
|
|
213
|
+
param = (symbol.to_s + '_given').to_sym
|
|
214
|
+
@options[symbol] = ''
|
|
215
|
+
@options.delete(param)
|
|
216
|
+
end
|
|
217
|
+
end
|
|
218
|
+
end
|
|
219
|
+
end
|
|
220
|
+
|
|
195
221
|
# checks input files based on bulk file type
|
|
196
222
|
def check_input_types
|
|
197
223
|
inputfiles = {}
|
data/lib/cheripic/implementer.rb
CHANGED
|
@@ -54,6 +54,7 @@ module Cheripic
|
|
|
54
54
|
sel_seq_len}
|
|
55
55
|
settings = inputs.select { |k| set2.include?(k) }
|
|
56
56
|
Options.update(settings)
|
|
57
|
+
logger.debug "parameter values set\n#{Options.current_values.to_yaml}"
|
|
57
58
|
@vars_extracted = false
|
|
58
59
|
@has_run = false
|
|
59
60
|
end
|
|
@@ -74,19 +75,21 @@ module Cheripic
|
|
|
74
75
|
end
|
|
75
76
|
# print selected variants that could be potential markers or mutation
|
|
76
77
|
out_file = File.open(@options[pos_type], 'w')
|
|
77
|
-
out_file.puts "Score\tAlleleFreq\tseq_id\tposition\tref_base\tcoverage\tbases\tbase_quals\tsequence_left\tAlt_seq\tsequence_right"
|
|
78
|
+
out_file.puts "Score\tAlleleFreq\tlength\tseq_id\tposition\tref_base\tcoverage\tbases\tbase_quals\tsequence_left\tAlt_seq\tsequence_right"
|
|
78
79
|
regions = Regions.new(@options.assembly)
|
|
79
80
|
@variants.send(pos_type).each_key do | frag |
|
|
80
81
|
contig_obj = @variants.assembly[frag]
|
|
81
82
|
if pos_type == :hmes_frags
|
|
82
83
|
positions = contig_obj.hm_pos.keys
|
|
84
|
+
score = contig_obj.hme_score
|
|
83
85
|
else
|
|
84
86
|
positions = contig_obj.hemi_pos.keys
|
|
87
|
+
score = contig_obj.bfr_score
|
|
85
88
|
end
|
|
86
89
|
positions.each do | pos |
|
|
87
90
|
pileup = @variants.pileups[frag].mut_bulk[pos]
|
|
88
91
|
seqs = regions.fetch_seq(frag,pos)
|
|
89
|
-
out_file.puts "#{
|
|
92
|
+
out_file.puts "#{score}\t#{contig_obj.hm_pos[pos]}\t#{contig_obj.length}\t#{pileup.to_s.chomp}\t#{seqs[0]}\t#{pileup.consensus}\t#{seqs[1]}"
|
|
90
93
|
end
|
|
91
94
|
end
|
|
92
95
|
out_file.close
|
data/lib/cheripic/options.rb
CHANGED
data/lib/cheripic/variants.rb
CHANGED
|
@@ -4,35 +4,6 @@ require 'forwardable'
|
|
|
4
4
|
|
|
5
5
|
module Cheripic
|
|
6
6
|
|
|
7
|
-
require 'bio-samtools'
|
|
8
|
-
require 'bio/db/sam'
|
|
9
|
-
require 'open3'
|
|
10
|
-
|
|
11
|
-
# An extension of Bio::DB::Sam object to modify depth method
|
|
12
|
-
class Bio::DB::Sam
|
|
13
|
-
|
|
14
|
-
# A method to retrieve depth information from bam object
|
|
15
|
-
# @param opts [Hash] a hash of following input options
|
|
16
|
-
# b [File] list of positions or regions in BED format
|
|
17
|
-
# l [INT] minQLen
|
|
18
|
-
# q [INT] base quality threshold
|
|
19
|
-
# Q [INT] mapping quality threshold
|
|
20
|
-
# r [chr:from-to] region
|
|
21
|
-
# @returns a block with each line reporting sequence_name, position and depth
|
|
22
|
-
def depth(opts={})
|
|
23
|
-
command = form_opt_string(self.samtools, 'depth', opts)
|
|
24
|
-
# capture returns string output, so careful not to give whole genome or big contigs for depth analysis
|
|
25
|
-
stdout, stderr, status = Open3.capture3(command)
|
|
26
|
-
unless status.success?
|
|
27
|
-
logger.error "resulted in exit code #{status.exitstatus} using #{command}"
|
|
28
|
-
logger.error "stderr output is: #{stderr}"
|
|
29
|
-
raise CheripicError
|
|
30
|
-
end
|
|
31
|
-
# return stdout
|
|
32
|
-
stdout
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
end
|
|
36
7
|
|
|
37
8
|
# Custom error handling for Variants class
|
|
38
9
|
class VariantsError < CheripicError; end
|
|
@@ -110,6 +81,8 @@ module Cheripic
|
|
|
110
81
|
logger.info "processing #{input} file"
|
|
111
82
|
if @params.input_format == 'pileup'
|
|
112
83
|
extract_pileup(infile, input)
|
|
84
|
+
elsif @params.input_format == 'vcf'
|
|
85
|
+
extract_vcfs(infile, input)
|
|
113
86
|
else
|
|
114
87
|
extract_bam_pileup(infile, input)
|
|
115
88
|
end
|
|
@@ -119,35 +92,21 @@ module Cheripic
|
|
|
119
92
|
@pileups_analyzed = true
|
|
120
93
|
end
|
|
121
94
|
|
|
122
|
-
#
|
|
123
|
-
# @param
|
|
124
|
-
#
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
data.split("\n").each do |line|
|
|
136
|
-
info = line.split("\t")
|
|
137
|
-
depths << info[2].to_i
|
|
138
|
-
end
|
|
139
|
-
variance = 0
|
|
140
|
-
mean_depth = depths.reduce(0, :+) / len.to_f
|
|
141
|
-
depths.each do |value|
|
|
142
|
-
variance += (value.to_f - mean_depth)**2
|
|
95
|
+
# Input vcf file is read and positions are selected that pass the thresholds
|
|
96
|
+
# @param vcffile [String] path to the vcf file to read
|
|
97
|
+
# @param sym [Symbol] Symbol of the pileup file used to write selected variants
|
|
98
|
+
# pileup information to respective ContigPileups object
|
|
99
|
+
def extract_vcfs(vcffile, sym)
|
|
100
|
+
# read vcf file and process each variant
|
|
101
|
+
File.foreach(vcffile) do |line|
|
|
102
|
+
next if line =~ /^#/
|
|
103
|
+
v = Bio::DB::Vcf.new(line)
|
|
104
|
+
unless v.alt == '.'
|
|
105
|
+
pileup_string = Vcf.to_pileup(v)
|
|
106
|
+
pileup = Pileup.new(pileup_string)
|
|
107
|
+
store_pileup_info(pileup, sym)
|
|
143
108
|
end
|
|
144
|
-
all_depths << mean_depth
|
|
145
|
-
contig_obj.sd_depth = Math.sqrt(variance)
|
|
146
|
-
contig_obj.mean_depth = mean_depth
|
|
147
109
|
end
|
|
148
|
-
# setting max depth as 3 times the average depth
|
|
149
|
-
mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
|
|
150
|
-
Options.maxdepth = Options.max_d_multiple * mean_coverage
|
|
151
110
|
end
|
|
152
111
|
|
|
153
112
|
# Input pileup file is read and positions are selected that pass the thresholds
|
|
@@ -159,8 +118,7 @@ module Cheripic
|
|
|
159
118
|
File.foreach(pileupfile) do |line|
|
|
160
119
|
pileup = Pileup.new(line)
|
|
161
120
|
if pileup.is_var
|
|
162
|
-
|
|
163
|
-
contig_obj.send(sym).store(pileup.pos, pileup)
|
|
121
|
+
store_pileup_info(pileup, sym)
|
|
164
122
|
end
|
|
165
123
|
end
|
|
166
124
|
end
|
|
@@ -175,44 +133,93 @@ module Cheripic
|
|
|
175
133
|
bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
|
|
176
134
|
bamobject.index unless bamobject.indexed?
|
|
177
135
|
|
|
136
|
+
# calculate max depth from the bamfile (mut_bulk only) when max_d_multiple is greater than zero
|
|
137
|
+
set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
|
|
178
138
|
# check if user has set max depth or set to zero to ignore
|
|
179
139
|
max_d = Options.maxdepth
|
|
180
|
-
|
|
181
|
-
if Options.max_d_multiple > 0
|
|
182
|
-
set_max_depth(bamobject, bamfile)
|
|
183
|
-
max_d = Options.maxdepth
|
|
184
|
-
logger.info "max depth used for #{sym} file\t#{max_d}"
|
|
185
|
-
end
|
|
140
|
+
logger.info "max depth used for #{sym} file\t#{max_d}"
|
|
186
141
|
|
|
187
142
|
@vcf_hash.each_key do | id |
|
|
188
143
|
positions = @vcf_hash[id][:het].keys
|
|
189
144
|
positions << @vcf_hash[id][:hom].keys
|
|
190
145
|
positions.flatten!
|
|
191
146
|
next if positions.empty?
|
|
192
|
-
contig_obj = @pileups[id]
|
|
193
147
|
positions.each do | pos |
|
|
194
148
|
command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
|
|
195
|
-
stdout
|
|
196
|
-
unless status.success?
|
|
197
|
-
logger.error "resulted in exit code #{status.exitstatus} using #{command}"
|
|
198
|
-
logger.error "stderr output is: #{stderr}"
|
|
199
|
-
raise CheripicError
|
|
200
|
-
end
|
|
201
|
-
stdout.chomp!
|
|
149
|
+
stdout = capture_command(command)
|
|
202
150
|
if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
|
|
203
151
|
logger.info "pileup data empty for\t#{id}\t#{pos}"
|
|
204
152
|
else
|
|
205
153
|
pileup = Pileup.new(stdout)
|
|
206
|
-
|
|
207
|
-
logger.info "pileup coverage is higher than max\t#{pileup.to_s}"
|
|
208
|
-
next
|
|
209
|
-
end
|
|
210
|
-
contig_obj.send(sym).store(pos, pileup)
|
|
154
|
+
store_pileup_info(pileup, sym)
|
|
211
155
|
end
|
|
212
156
|
end
|
|
213
157
|
end
|
|
214
158
|
end
|
|
215
159
|
|
|
160
|
+
# Bam object is read and each contig mean and std deviation of depth calculated
|
|
161
|
+
# @param bamobject [Bio::DB::Sam]
|
|
162
|
+
# Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
|
|
163
|
+
def set_max_depth(bamobject, bamfile)
|
|
164
|
+
logger.info "processing #{bamfile} file for depth"
|
|
165
|
+
all_depths = []
|
|
166
|
+
bq = Options.base_quality
|
|
167
|
+
mq = Options.mapping_quality
|
|
168
|
+
@assembly.each_key do | id |
|
|
169
|
+
contig_obj = @assembly[id]
|
|
170
|
+
len = contig_obj.length
|
|
171
|
+
command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
|
|
172
|
+
data = capture_command(command)
|
|
173
|
+
if data == ''
|
|
174
|
+
logger.info "depth data empty for\t#{id}"
|
|
175
|
+
next
|
|
176
|
+
end
|
|
177
|
+
depths = []
|
|
178
|
+
data.split("\n").each do |line|
|
|
179
|
+
info = line.split("\t")
|
|
180
|
+
depths << info[2].to_i
|
|
181
|
+
end
|
|
182
|
+
variance = 0
|
|
183
|
+
mean_depth = depths.reduce(0, :+) / len.to_f
|
|
184
|
+
depths.each do |value|
|
|
185
|
+
variance += (value.to_f - mean_depth)**2
|
|
186
|
+
end
|
|
187
|
+
all_depths << mean_depth
|
|
188
|
+
contig_obj.sd_depth = Math.sqrt(variance)
|
|
189
|
+
contig_obj.mean_depth = mean_depth
|
|
190
|
+
end
|
|
191
|
+
# setting max depth as max_d_multiple times the average depth
|
|
192
|
+
mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
|
|
193
|
+
Options.maxdepth = Options.max_d_multiple * mean_coverage
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
def capture_command(command)
|
|
197
|
+
stdout, stderr, status = Open3.capture3(command)
|
|
198
|
+
unless status.success?
|
|
199
|
+
logger.error "resulted in exit code #{status.exitstatus} using #{command}"
|
|
200
|
+
logger.error "stderr output is: #{stderr}"
|
|
201
|
+
raise CheripicError
|
|
202
|
+
end
|
|
203
|
+
stdout.chomp
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
# stores pileup information provided to respective contig_pileup object using sym input
|
|
207
|
+
# @param pileup [Pileup] Pileup objects
|
|
208
|
+
# @param sym [Symbol] Symbol of the input file used to write selected variants
|
|
209
|
+
# pileup information stored to respective ContigPileups object
|
|
210
|
+
def store_pileup_info(pileup, sym)
|
|
211
|
+
# discarding variants with higher than max depth only for mut_bulk
|
|
212
|
+
if sym == :mut_bulk
|
|
213
|
+
unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
|
|
214
|
+
logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
|
|
215
|
+
return nil
|
|
216
|
+
end
|
|
217
|
+
end
|
|
218
|
+
contig_obj = @pileups[pileup.ref_name]
|
|
219
|
+
contig_obj.send(sym).store(pileup.pos, pileup)
|
|
220
|
+
nil
|
|
221
|
+
end
|
|
222
|
+
|
|
216
223
|
# Once pileup files are analysed and variants are extracted from each bulk;
|
|
217
224
|
# bulks are compared to identify and isolate variants for downstream analysis.
|
|
218
225
|
# If polyploidy set to true and mut_parent and bg_parent bulks are provided
|
data/lib/cheripic/vcf.rb
CHANGED
|
@@ -9,33 +9,38 @@ module Cheripic
|
|
|
9
9
|
|
|
10
10
|
class Vcf
|
|
11
11
|
|
|
12
|
-
def self.
|
|
12
|
+
def self.get_allele_depth(vcf_obj)
|
|
13
13
|
# check if the vcf is from samtools (has DP4 and AF1 fields in INFO)
|
|
14
14
|
if vcf_obj.info.key?('DP4')
|
|
15
15
|
freq = vcf_obj.info['DP4'].split(',')
|
|
16
|
-
|
|
16
|
+
ref = freq[0].to_f + freq[1].to_f
|
|
17
17
|
alt = freq[2].to_f + freq[3].to_f
|
|
18
|
-
allele_freq = alt / depth
|
|
19
|
-
# allele_freq = vcf_obj.non_ref_allele_freq
|
|
20
18
|
# check if the vcf is from VarScan (has RD, AD and FREQ fields in FORMAT)
|
|
21
19
|
elsif vcf_obj.samples['1'].key?('RD')
|
|
20
|
+
ref = vcf_obj.samples['1']['RD'].to_f
|
|
22
21
|
alt = vcf_obj.samples['1']['AD'].to_f
|
|
23
|
-
depth = vcf_obj.samples['1']['RD'].to_f + alt
|
|
24
|
-
allele_freq = alt / depth
|
|
25
22
|
# check if the vcf is from GATK (has AD and GT fields in FORMAT)
|
|
26
23
|
elsif vcf_obj.samples['1'].key?('AD') and vcf_obj.samples['1']['AD'].include?(',')
|
|
27
24
|
freq = vcf_obj.samples['1']['AD'].split(',')
|
|
28
|
-
|
|
25
|
+
ref = freq[0].to_i
|
|
26
|
+
alt = freq[1].to_i
|
|
29
27
|
# check if the vcf has AF fields in INFO
|
|
30
28
|
elsif vcf_obj.info.key?('AF')
|
|
31
29
|
allele_freq = vcf_obj.info['AF'].to_f
|
|
30
|
+
depth = vcf_obj.info['DP'].to_i
|
|
31
|
+
alt = (depth * allele_freq).round
|
|
32
|
+
ref = depth - alt
|
|
32
33
|
else
|
|
33
34
|
raise VcfError.new 'not a supported vcf format (VarScan, GATK, Bcftools(Samtools), Vcf 4.0, 4.1 and 4.2)' +
|
|
34
35
|
" and check that it is one sample vcf\n"
|
|
35
36
|
end
|
|
36
|
-
|
|
37
|
+
[ref, alt]
|
|
37
38
|
end
|
|
38
39
|
|
|
40
|
+
def self.get_allele_freq(vcf_obj)
|
|
41
|
+
ref, alt = get_allele_depth(vcf_obj)
|
|
42
|
+
alt.to_f/(ref + alt)
|
|
43
|
+
end
|
|
39
44
|
|
|
40
45
|
##Input: vcf file
|
|
41
46
|
##Output: lists of hm and ht SNPS and hash of all fragments with variants
|
|
@@ -78,6 +83,25 @@ module Cheripic
|
|
|
78
83
|
var_pos_mut
|
|
79
84
|
end
|
|
80
85
|
|
|
86
|
+
def self.to_pileup(v)
|
|
87
|
+
ref, alt = Vcf.get_allele_depth(v)
|
|
88
|
+
depth = ref + alt
|
|
89
|
+
alt_bases = '' + v.alt
|
|
90
|
+
ref_len = v.ref.length
|
|
91
|
+
alt_len = v.alt.length
|
|
92
|
+
if ref_len > alt_len
|
|
93
|
+
seq = v.ref[alt_len..-1]
|
|
94
|
+
alt_bases = '-' + seq.length.to_s + seq
|
|
95
|
+
v.ref = v.ref[0]
|
|
96
|
+
elsif ref_len < alt_len
|
|
97
|
+
seq = v.alt[ref_len..-1]
|
|
98
|
+
alt_bases = '+' + seq.length.to_s + seq
|
|
99
|
+
end
|
|
100
|
+
bases = ('.' * ref) + ( alt_bases * alt)
|
|
101
|
+
quality = 'D' * depth
|
|
102
|
+
[v.chrom, v.pos, v.ref, depth, bases, quality].join("\t")
|
|
103
|
+
end
|
|
104
|
+
|
|
81
105
|
end
|
|
82
106
|
|
|
83
107
|
end
|
data/lib/cheripic/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cheripic
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.2.
|
|
4
|
+
version: 1.2.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Shyam Rallapalli
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-10-
|
|
11
|
+
date: 2016-10-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: yell
|