bio-sam-mutation 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +20 -0
- data/README.md +88 -0
- data/README.rdoc +48 -0
- data/Rakefile +54 -0
- data/bin/mutations +108 -0
- data/bin/sam-mutation +20 -0
- data/lib/bio-sam-mutation.rb +26 -0
- data/lib/bio-sam-mutation/bio/alignment/cigar.rb +239 -0
- data/lib/bio-sam-mutation/bio/alignment/iterate_pairs.rb +68 -0
- data/lib/bio-sam-mutation/bio/db/alignment.rb +176 -0
- data/lib/bio-sam-mutation/bio/db/tag.rb +5 -0
- data/lib/bio-sam-mutation/bio/db/tag/md.rb +126 -0
- data/lib/bio-sam-mutation/bio/mutantallele.rb +24 -0
- data/lib/bio-sam-mutation/bio/mutation.rb +63 -0
- data/lib/bio-sam-mutation/bio/mutation_array.rb +15 -0
- data/lib/bio-sam-mutation/bio/vephgvs.rb +21 -0
- data/lib/bio-sam-mutation/mutationscli.rb +83 -0
- data/test/helper.rb +34 -0
- data/test/test_cigar.rb +145 -0
- data/test/test_mdtag.rb +46 -0
- data/test/test_mutant_allele.rb +21 -0
- data/test/test_mutation.rb +84 -0
- data/test/test_mutation_array.rb +13 -0
- data/test/test_sam.rb +160 -0
- data/test/test_vep_hgvs.rb +9 -0
- metadata +247 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 89bbe12a6c812fdc4726e74a758d3243575875ea
|
4
|
+
data.tar.gz: 651ffa24264f666bf86740d7f7043845e3eae5a7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3f27e36ba12e338179e310964e5867c3fdaedb8f799e50b49d06f65fe545d79223dc7f37ef6b88770d61d18b31b79c4edd4db58b5d77919f28c6587787db1a7b
|
7
|
+
data.tar.gz: 0904d6a55dd48f26ba20cbc9e11c65d00d1720e37b61cf9d15fe2f0b81afdf1c38b796ec265fc690f02b77abe6a39a6148da131dac9a42ed87e29424938875b1
|
data/.document
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Gem dependencies for bio-sam-mutation.
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

gem "bio", ">= 1.4.2"
# Using edge version due to a problem with ruby >2.1 in biogems version at time of writing
gem "bio-samtools", "~>2.3.4", git: "https://github.com/helios/bioruby-samtools.git", ref: "2e77274"
# JSON serialisation:
gem "oj", "~>2.14"
# At the time of writing, the released version 0.2.0 does not include the variation#vep_hgvs method
# so use this specific commit:
gem "bio-ensembl-rest", "0.2.0", git: "https://github.com/ALTree/bio-ensembl-rest.git", ref: "c934fa0"
gem "trollop"
gem "rake", "~>0.9"

# Only needed for building, documenting and testing the gem.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "simplecov", ">= 0"
  gem "jeweler", "~> 2.0"
  gem "bundler"
  gem "test-unit", "~> 3.0"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 stveep
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# bio-sam-mutation
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/stveep/bioruby-sam-mutation)
|
4
|
+
|
5
|
+
* Methods for calling mutations from SAM alignments, including CIGAR and MD tag parsers.
|
6
|
+
|
7
|
+
* Annotates mutations in HGVS format: http://www.hgvs.org/mutnomen/recs.html.
|
8
|
+
|
9
|
+
* Incorporates Ensembl VEP lookup.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```sh
|
14
|
+
gem install bio-sam-mutation
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'bio-sam-mutation'
|
21
|
+
|
22
|
+
# NB must be tab-delimited
|
23
|
+
insertion_and_deletion = Bio::DB::Alignment.new("I2M5K:00253:00406\t0\t5\t112839854\t70\t63M2I138M1D27M7S\t*\t0\t0\tCAGTGATCTTCCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACACCTCCACCATACCTCCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCACCTACTGCTGAAAAGAGAGAGAGTGGACCTAAGCAAGCTGCAGTAAATGCTGCAGTTCAGAGGGTCCAGGTTCTTCCAGATGCTGATACTTATTACATTTTGCCACGGAAAGTACTGCTGAGG\t@CDDDCCCCACACCCCCCCC?CCACCCC>A6;;;;7;;6;6;BC;;6;;;;;.;;>ADDA??;;;;;?CCACCCD>C??@CCCC>C@C;>?CCCC@C=::@:::::+:::/:CCC?>>>>CCCCDDD9CCCC@AB????=AB>??;?BB>@@@AA???CC<@@?????BB>??;;;B<BC;??8;6:A=@=@BBB;;;?<77//*08*088888*8=9=?B7;;4;??????????<\tPG:Z:novoalign\tAS:i:183\tUQ:i:183\tNM:i:3\tMD:Z:201^T27")
|
24
|
+
|
25
|
+
insertion_and_deletion.mutations
|
26
|
+
#=> [#<Bio::Mutation:0x007fa20b5b4fc8 @position=112839916, @type=:insertion, @reference=nil, @mutant="AT", @seqname="5">, #<Bio::Mutation:0x007fa20b5b4960 @position=112840055, @type=:deletion, @reference="T", @mutant=nil, @seqname="5">]
|
27
|
+
|
28
|
+
insertion_and_deletion.mutations.first.to_hgvs("g")
|
29
|
+
#=> "5:g.112839916_112839917insAT"
|
30
|
+
|
31
|
+
puts YAML.dump(insertion_and_deletion.mutations.first.vep("human","g").first["transcript_consequences"].keep_if{|c| c["transcript_id"] == "ENST00000257430"})
|
32
|
+
#---
|
33
|
+
# - variant_allele: AT
|
34
|
+
# cdna_end: 4379
|
35
|
+
# codons: cca/ccATa
|
36
|
+
# protein_end: 1441
|
37
|
+
# strand: 1
|
38
|
+
# hgnc_id: HGNC:583
|
39
|
+
# amino_acids: P/PX
|
40
|
+
# gene_symbol: APC
|
41
|
+
# cdna_start: 4378
|
42
|
+
# transcript_id: ENST00000257430
|
43
|
+
# cds_start: 4322
|
44
|
+
# gene_id: ENSG00000134982
|
45
|
+
# protein_start: 1441
|
46
|
+
# biotype: protein_coding
|
47
|
+
# gene_symbol_source: HGNC
|
48
|
+
# cds_end: 4323
|
49
|
+
# consequence_terms:
|
50
|
+
# - frameshift_variant
|
51
|
+
# impact: HIGH
|
52
|
+
# => nil
|
53
|
+
|
54
|
+
# E.g. of full request return
|
55
|
+
# http://rest.ensembl.org/documentation/info/vep_hgvs_get
|
56
|
+
insertion_and_deletion.mutations(112839854).first.vep("human","g")
|
57
|
+
# => [{"assembly_name"=>"GRCh38", "end"=>112839917, "seq_region_name"=>"5", "transcript_consequences"=>[{"gene_id"=>"ENSG00000134982", "distance"=>46, "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000502371", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4380, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4379, "transcript_id"=>"ENST00000257430", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>863, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000507379", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4481, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4480, "transcript_id"=>"ENST00000508376", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>409, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000512211", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "variant_allele"=>"AT", "cdna_end"=>4569, "biotype"=>"nonsense_mediated_decay", 
"gene_symbol_source"=>"HGNC", "consequence_terms"=>["3_prime_UTR_variant", "NMD_transcript_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "cdna_start"=>4568, "transcript_id"=>"ENST00000508624", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000258864", "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"Clone_based_vega_gene", "consequence_terms"=>["intron_variant", "NMD_transcript_variant"], "strand"=>1, "gene_symbol"=>"CTC-554D6.1", "transcript_id"=>"ENST00000520401", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "distance"=>2195, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000504915", "impact"=>"MODIFIER"}], "strand"=>1, "id"=>"5:g.112839917_112839918insAT", "allele_string"=>"-/AT", "most_severe_consequence"=>"frameshift_variant", "start"=>112839918}]
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
```
|
62
|
+
|
63
|
+
The API doc is online. For more code examples see the test files in
|
64
|
+
the source tree.
|
65
|
+
|
66
|
+
## Project home page
|
67
|
+
|
68
|
+
For information on the source tree, documentation, examples, issues and
|
69
|
+
how to contribute, see
|
70
|
+
|
71
|
+
http://github.com/stveep/bioruby-sam-mutation
|
72
|
+
|
73
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
74
|
+
|
75
|
+
## Cite
|
76
|
+
|
77
|
+
If you use this software, please cite one of
|
78
|
+
|
79
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
80
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
81
|
+
|
82
|
+
## Biogems.info
|
83
|
+
|
84
|
+
This Biogem is published at (http://biogems.info/index.html#bio-sam)
|
85
|
+
|
86
|
+
## Copyright
|
87
|
+
|
88
|
+
Copyright (c) 2015 stveep. See LICENSE.txt for further details.
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-sam
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/stveep/bioruby-sam.png"
|
5
|
+
/>}[http://travis-ci.org/#!/stveep/bioruby-sam]
|
6
|
+
|
7
|
+
Full description goes here
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-sam
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-sam'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/stveep/bioruby-sam
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
|
39
|
+
* {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-sam
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2015 stveep. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: utf-8

# Rake tasks for bio-sam-mutation: gem packaging (Jeweler), tests,
# coverage and RDoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the missing gems and exit with Bundler's own status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "bio-sam-mutation"
  gem.version = '0.4.1'
  gem.homepage = "http://github.com/stveep/bioruby-sam-mutation"
  gem.license = "MIT"
  gem.summary = %Q{Parsing and mutation calling from SAM, CIGAR and MD:Z.}
  gem.description = %Q{Simple classes for parsing SAM, CIGAR and MD:Z strings, including slices. Methods for calling mutations in HGVS format and looking up consequences using Ensembl VEP REST API. Developed for calling mutations at an expected position in an alignment - e.g. Amplicon sequencing of CRISPR-induced mutations.}
  gem.email = "spettitt@gmail.com"
  gem.authors = ["Stephen Pettitt"]
  # dependencies defined in Gemfile
  # gem.required_ruby_version = '>= 1.9.3'
  gem.executables << "mutations"
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # The COVERAGE environment variable is set before the test task runs.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
# RDoc::Task is the class provided by 'rdoc/task'; Rake::RDocTask is only a
# deprecated alias for it.
RDoc::Task.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-sam-mutation #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/bin/mutations
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end with two mutually exclusive modes:
#   --tag     passes every input line (files or piped stream) to MutationsCLI.tag
#   --report  writes "<product>-report.txt" per configured product, listing
#             per-allele read counts and VEP annotation for each input file.
$LOAD_PATH.unshift File.join(__dir__, '..', 'lib')
require 'trollop'
require 'bio-sam-mutation'
begin
  # pry is not declared in the Gemfile and nothing below calls it, so its
  # absence must not stop the script from starting.
  require 'pry'
rescue LoadError
  nil
end

# NOTE(review): --example-config is declared here but nothing in this script acts on it.
opts = Trollop::options do
  opt :config, "Configuration file in YAML format. Defaults to ./config.yml. Run with --example-config for example.", type: :string, default: "config.yml"
  opt :example_config, "Show example configuration file."
  opt :output, "Output file, works with single product only. Better to define in config file.", type: :string, default: "output.sam"
  opt :tag, "Tag an input SAM file (or piped stream of SAM data) with mutation calls"
  opt :report, "Produce a report by amplicon for each file given, including read counts and annotation for each allele. (does not work on a stream)"
  opt :mincov, "Minimum fraction of reads to report a mutant allele.", default: 0.01
  opt :flag, "Minimum reads to flag a potential mutant (= no wild type).", default: 20
end

raise "Cannot use report and tag simultaneously." if opts[:report] && opts[:tag]

# Trollop removes and parses the options, leaving either input files or the incoming stream.
# --config always has a value (its default), so test for the file itself:
# a missing default file means "no configuration", whereas a missing file
# that was explicitly requested is an error.
config =
  if File.exist?(opts[:config])
    YAML.load_file(opts[:config]) || {}
  elsif opts[:config_given]
    raise "Configuration file not found: #{opts[:config]}"
  else
    {}
  end

if opts[:tag]
  single_outfile = nil
  if config.key?(:products)
    config[:products].each do |product_name, config_hash|
      config[product_name] = MutationsCLI.set_defaults(config_hash)
    end
    config = MutationsCLI.construct_products(config)
  else
    config[:single_product] = true
    config[:output] ||= opts[:output]
    single_outfile = File.open(config[:output], 'w')
    config[:outfile] = single_outfile
  end

  begin
    ARGF.each do |input|
      MutationsCLI.tag input, config
    end
  ensure
    # Only the handle opened by this script is closed here.
    single_outfile.close if single_outfile && !single_outfile.closed?
  end
end

if opts[:report]
  products = config[:products]
  raise "Report mode requires a :products section in the configuration file." unless products

  products.each do |product_name, config_hash|
    config_hash = MutationsCLI.set_defaults(config_hash)
    lookups = {}   # formatted-consequence cache, keyed by HGVS string
    mutants = []   # files with no wild-type allele above threshold
    # Interpolation also works when the product name is a Symbol; the block
    # form closes the report even if an input file cannot be processed.
    File.open("#{product_name}-report.txt", 'w') do |out|
      ARGV.each do |file|
        calls = Hash.new { |h, k| h[k] = MutantAllele.new }
        File.foreach(file) do |input|
          next if input.match(/^@/) # skip sam headers
          sam = Bio::DB::Alignment.new(input)
          # Check correct start in case of file with mixed amplicons
          if config_hash[:start]
            next unless sam.seq.match Regexp.new("^" + config_hash[:start])
          end
          # Must have sufficient mapped length to call mutations in the given interval:
          next if sam.query_unmapped
          next if config_hash[:length] > Bio::Alignment::CIGAR.new(sam.cigar).reference_length - config_hash[:offset]
          key = "w.t."
          muts = MutationsCLI.call_mutations_given_product sam, config_hash
          if muts
            key = muts.to_hgvs
            calls[key].mutations ||= muts
          end
          calls[key].seq ||= sam.query(config_hash[:offset], config_hash[:length])
          calls[key].example ||= sam
          calls[key].count += 1
        end

        # Seeded with 0 so a file with no usable reads gives 0 rather than nil.
        total_reads = calls.values.map(&:count).reduce(0, :+)
        out.puts file
        out.puts "Total reads: #{total_reads}"
        threshold = opts[:mincov] * total_reads
        calls.keep_if { |_key, allele| allele.count > threshold }
        # Parenthesised so that both conditions are evaluated: flag the file
        # only when wild type is absent AND there are enough reads.
        mutants << file unless calls.key?("w.t.") || total_reads < opts[:flag]
        calls = calls.sort_by { |_key, allele| allele.count }.reverse.to_h
        calls.each do |key, allele|
          formatted = nil
          if allele.mutations
            hgvs = allele.mutations.to_hgvs
            if allele.mutations.size == 1 # VEP lookup doesn't work for compound mutations
              result = nil
              vep = begin
                allele.lookup
              rescue RuntimeError
                nil
              end
              # TODO: sort out the caching - does it actually work?
              # NOTE(review): the cache is consulted only after allele.lookup has
              # already run, so it does not avoid the lookup itself.
              if vep
                result = lookups[hgvs] || VepHgvs.consequences_for_transcript(vep, config_hash[:transcript]).first
                if result
                  formatted = [[result["CDS position"].to_s, result["Allele"]].join(" "), [result["Protein start"].to_s, result["Mutation"]].join(" "), result["Consequence"]].join("\t")
                end
              else
                formatted = "No VEP result"
              end
              lookups[hgvs] ||= result
            else
              formatted = "Compound mutant"
            end
          end
          formatted ||= "No mutation"
          out.puts [allele.seq, allele.count, key, formatted].join("\t")
        end
        out.puts "\n===================================="
      end
      out.puts "Files with wild type below threshold:"
      out.puts mutants.join("\n")
    end
  end
end
|
data/bin/sam-mutation
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'bio-sam-mutation'
require 'thor'

# Thor command-line interface exposing a single "tag" command.
class SamMutationCLI < Thor
  desc "tag [--config=config.yml]", "Tag a SAM file with HGVS annotations."
  # NOTE(review): the config option is declared but not read by #tag.
  option :config
  # Reads file_name line by line, skips header lines (starting with "@") and
  # prints each alignment that has mutations with an added YH tag holding
  # their HGVS description. Alignments without mutations are not printed.
  def tag(file_name)
    # File.foreach closes the file when iteration finishes.
    File.foreach(file_name) do |line|
      next if line.match(/^@/)
      sam = Bio::DB::Alignment.new(line)
      mutations = sam.mutations
      next unless mutations
      new_tag = Bio::DB::Tag.new("YH:m:" + mutations.to_hgvs)
      puts sam.add_tag(new_tag)
    end
  end
end

SamMutationCLI.start ARGV
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio'
|
12
|
+
require 'bio-ensembl-rest'
|
13
|
+
require 'bio-samtools'
|
14
|
+
require 'oj'
|
15
|
+
require 'yaml'
|
16
|
+
require 'bio-sam-mutation/bio/db/alignment'
|
17
|
+
require 'bio-sam-mutation/bio/alignment/iterate_pairs'
|
18
|
+
require 'bio-sam-mutation/bio/alignment/cigar'
|
19
|
+
require 'bio-sam-mutation/bio/db/tag'
|
20
|
+
require 'bio-sam-mutation/bio/db/tag/md'
|
21
|
+
require 'bio-sam-mutation/bio/vephgvs'
|
22
|
+
require 'bio-sam-mutation/bio/mutation'
|
23
|
+
require 'bio-sam-mutation/bio/mutation_array'
|
24
|
+
require 'bio-sam-mutation/bio/mutantallele'
|
25
|
+
|
26
|
+
require 'bio-sam-mutation/mutationscli'
|
@@ -0,0 +1,239 @@
|
|
1
|
+
# Parse a CIGAR string
|
2
|
+
# An example from Exonerate output. Ideally will also allow SAM file input to be used.
|
3
|
+
# 1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 63
|
4
|
+
# | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
|
5
|
+
# 1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 62
|
6
|
+
#
|
7
|
+
# ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG...
|
8
|
+
# cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17
|
9
|
+
# I are not counted in reference
|
10
|
+
# Regexp from the SAM specification, where the number comes first (in exonerate the operation letter comes first): ([0-9]+[MIDNSHP])+|\*
|
11
|
+
|
12
|
+
# Parsed CIGAR string, held as an array of [operation, length] pairs, with an
# optional reference sequence. Understands both the SAM layout ("3M1I")
# and the exonerate layout ("M 3 I 1").
class Bio::Alignment::CIGAR
  include Bio::Alignment::IteratePairs

  class << self
    attr_accessor :reference_operations, :query_operations, :subexp, :regexps
  end

  # One regexp per supported source; the capture order differs between them.
  self.regexps = {"exonerate" => /([MIDNSHP]{1})(\d+)/, "sam" => /(\d+)([MIDNSHP]{1})/}
  # Type of elements that count towards the reference length:
  # TODO: add full support for other elements S, H etc.
  self.reference_operations = /[MD]/
  # Type of elements that count towards the query length:
  self.query_operations = /[MIS]/
  self.subexp = /([atgcAGCT]+)>([atgcAGTC]+)/
  attr_accessor :pairs, :reference

  # string - CIGAR text; whitespace is ignored.
  # ref    - optional reference sequence the CIGAR describes.
  # source - "exonerate" or "sam"; auto-detected from the start of the string
  #          when it is not one of those.
  # Raises RuntimeError if the source cannot be determined.
  def initialize(string, ref=nil, source="")
    # Strip whitespace on a copy, so the caller's string is left untouched
    # and frozen strings are accepted.
    string = string.gsub(/\s+/, "")

    regexps = Bio::Alignment::CIGAR.regexps
    # Auto-detect source if not supplied
    unless regexps.key?(source)
      regexps.each do |name, pattern|
        # Look for match at start of string
        found = string.match(pattern)
        source = name if found && found.begin(0) == 0
      end
      # Also covers an unrecognised, non-empty source that failed detection.
      unless regexps.key?(source)
        raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect."
      end
    end

    # Make an array of pairs of cigar elements, always as [operation, length]:
    letter_first = (source == "exonerate")
    @pairs = string.scan(regexps[source]).map do |first, second|
      letter_first ? [first.to_s, second.to_i] : [second.to_s, first.to_i]
    end

    # Include reference sequence if provided
    @reference = ref
    # Check length of reference = sum(M+D)?
    #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length
  end

  # Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment.
  # regexp selects which operations count towards offset and length.
  def subalignment(offset, length, regexp=Bio::Alignment::CIGAR.reference_operations)
    new_array = iterate_pairs(@pairs, offset, length, regexp)
    # Return a CIGAR instance with just the new alignment
    new_string = new_array.join(" ")
    # -1 from offset as ruby string starts at zero. The reference is optional,
    # so only slice it when one was supplied.
    sub_reference = @reference && @reference[offset-1, length]
    Bio::Alignment::CIGAR.new(new_string, sub_reference).remove_empty!
  end
  alias_method :slice, :subalignment

  # Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion
  def subcigar(offset, length)
    # No regexp - includes everything
    subalignment(offset, length, //)
  end

  # Subregion where offset and length are counted over M, D and I operations.
  def unmasked(offset, length)
    subalignment(offset, length, /[MDI]/)
  end

  # Deletions no longer than threshold are converted to matches; other
  # non-match operations no longer than threshold are removed. Returns self.
  def remove_small!(threshold=1)
    deletions_to_matches(threshold)
    remove_small_nonmatches(threshold)
    self
  end

  # Drops zero-length operations. Returns self.
  def remove_empty!
    pairs.keep_if { |_op, count| count != 0 }
    self
  end

  # Total length of M operations.
  def matched_length
    count_type("M")
  end

  # Total length of D operations.
  def deleted_length
    count_type("D")
  end

  # Total length of I operations.
  def inserted_length
    count_type("I")
  end

  # Total length of S and H operations.
  def masked_length
    count_type(/[SH]/)
  end

  # Total length of operations matching reference_operations.
  def reference_length
    count_type(Bio::Alignment::CIGAR.reference_operations)
  end

  # Total length of operations matching query_operations.
  def query_length
    count_type(Bio::Alignment::CIGAR.query_operations)
  end

  # Output a representation of the query: replace deleted portions with "-", flag insertions with "[n]".
  # Optionally provide the sequence (or symbols to use) of insertions, in order of appearance,
  # as a String (single insertion) or an Array.
  # Requires a reference sequence.
  # TODO: Add support for substitution highlighting (e.g lowercasing)
  def query(insertions=nil)
    if insertions
      # Work on a copy: entries are consumed with shift below and the
      # caller's array must not be emptied.
      insertions = insertions.is_a?(String) ? [insertions] : insertions.dup
    end
    sequence = []
    total = 0
    @pairs.each do |op, count|
      case op
      when "M"
        sequence << @reference[total..total+count-1].upcase
        total += count
      when "I"
        sequence << (insertions ? insertions.shift.to_s : '[' + count.to_s + ']')
      when "D"
        sequence << "-" * count
        total += count
      end
    end
    sequence.join("")
  end

  # Output HGVS-style variant text given the reference position of the base
  # before the alignment start. Deletions take their bases from the reference;
  # insertions use the supplied sequences, or "N" when none are left.
  # Returns nil when there is nothing to report.
  # subs: optional Array of extra annotation strings (e.g. substitutions); only the first extra argument is used.
  # NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)
  def hgnc(reference_pos=0, insertions=[], type="g", *subs)
    # Copy (and tolerate nil) so the caller's array is not consumed by shift.
    insertions = insertions.is_a?(String) ? [insertions] : Array(insertions).dup
    total = 0
    hgnc_format = []
    @pairs.each do |op, count|
      case op
      when "M"
        reference_pos += count
        total += count
      when "D"
        deleted_bases = @reference[total, count].upcase
        first_deleted = reference_pos + 1
        last_deleted = reference_pos + count
        span = (count == 1) ? first_deleted.to_s : first_deleted.to_s + "_" + last_deleted.to_s
        hgnc_format << span + "del" + deleted_bases
        total += count
        # Deleted bases are part of the reference, so later events must be
        # positioned after them.
        reference_pos += count
      when "I"
        inserted_bases = insertions.empty? ? "N" : insertions.shift
        hgnc_format << reference_pos.to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
      end
    end
    # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
    subs = subs.first # >1 arguments discarded
    hgnc_format += subs if subs.is_a?(Array) && !subs.empty?

    case hgnc_format.length
    when 0 then nil
    when 1 then type.to_s + "." + hgnc_format[0]
    else type.to_s + "." + "[" + hgnc_format.join(";") + "]"
    end
  end

  # TODO combine adjacent operations of the same type into a single pair
  def combine_adjacent

  end

  # Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation,
  # followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.
  def positions(type)
    total = 0
    qtotal = 0
    hash = Hash.new{|h,k| h[k] = []}
    @pairs.each do |op, count|
      found = op.match(type)
      hash[found[0]] << [total, count, qtotal] if found
      total += count if op.match Bio::Alignment::CIGAR.reference_operations
      qtotal += count if op.match Bio::Alignment::CIGAR.query_operations
    end
    hash
  end


  private

  # Relabels deletions of at most threshold bases as matches (in place).
  def deletions_to_matches(threshold)
    self.pairs = @pairs.each{|pair| pair[0].sub!("D","M") if pair[1] <= threshold}
  end

  # Keeps matches and any other operation longer than threshold.
  def remove_small_nonmatches(threshold)
    self.pairs = @pairs.keep_if{|pair| pair[0] == "M" || pair[1] > threshold}
  end

  # Sum of the lengths of all operations matching type (String or Regexp).
  def count_type(type)
    @pairs.reduce(0) { |sum, (op, count)| op.match(type) ? sum + count : sum }
  end

end #class
|