bio-sam-mutation 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +21 -0
- data/LICENSE.txt +20 -0
- data/README.md +88 -0
- data/README.rdoc +48 -0
- data/Rakefile +54 -0
- data/bin/mutations +108 -0
- data/bin/sam-mutation +20 -0
- data/lib/bio-sam-mutation.rb +26 -0
- data/lib/bio-sam-mutation/bio/alignment/cigar.rb +239 -0
- data/lib/bio-sam-mutation/bio/alignment/iterate_pairs.rb +68 -0
- data/lib/bio-sam-mutation/bio/db/alignment.rb +176 -0
- data/lib/bio-sam-mutation/bio/db/tag.rb +5 -0
- data/lib/bio-sam-mutation/bio/db/tag/md.rb +126 -0
- data/lib/bio-sam-mutation/bio/mutantallele.rb +24 -0
- data/lib/bio-sam-mutation/bio/mutation.rb +63 -0
- data/lib/bio-sam-mutation/bio/mutation_array.rb +15 -0
- data/lib/bio-sam-mutation/bio/vephgvs.rb +21 -0
- data/lib/bio-sam-mutation/mutationscli.rb +83 -0
- data/test/helper.rb +34 -0
- data/test/test_cigar.rb +145 -0
- data/test/test_mdtag.rb +46 -0
- data/test/test_mutant_allele.rb +21 -0
- data/test/test_mutation.rb +84 -0
- data/test/test_mutation_array.rb +13 -0
- data/test/test_sam.rb +160 -0
- data/test/test_vep_hgvs.rb +9 -0
- metadata +247 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 89bbe12a6c812fdc4726e74a758d3243575875ea
|
4
|
+
data.tar.gz: 651ffa24264f666bf86740d7f7043845e3eae5a7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3f27e36ba12e338179e310964e5867c3fdaedb8f799e50b49d06f65fe545d79223dc7f37ef6b88770d61d18b31b79c4edd4db58b5d77919f28c6587787db1a7b
|
7
|
+
data.tar.gz: 0904d6a55dd48f26ba20cbc9e11c65d00d1720e37b61cf9d15fe2f0b81afdf1c38b796ec265fc690f02b77abe6a39a6148da131dac9a42ed87e29424938875b1
|
data/.document
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Gem dependencies for bio-sam-mutation.
# Fetch gems over HTTPS so downloads cannot be read or altered in transit.
source "https://rubygems.org"

gem "bio", ">= 1.4.2"
# Using edge version due to a problem with ruby >2.1 in biogems version at time of writing
gem "bio-samtools", "~>2.3.4", git: "https://github.com/helios/bioruby-samtools.git", ref: "2e77274"
# JSON serialisation:
gem "oj", "~>2.14"
# At the time of writing, the released version 0.2.0 does not include the variation#vep_hgvs method
# so use this specific commit:
gem "bio-ensembl-rest", "0.2.0", git: "https://github.com/ALTree/bio-ensembl-rest.git", ref: "c934fa0"
gem "trollop"
gem "rake", "~>0.9"

# Only needed to build, document and test the gem itself.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "simplecov", ">= 0"
  gem "jeweler", "~> 2.0"
  gem "bundler"
  gem "test-unit", "~> 3.0"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 stveep
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# bio-sam-mutation
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/stveep/bioruby-sam-mutation.png)](http://travis-ci.org/stveep/bioruby-sam-mutation)
|
4
|
+
|
5
|
+
* Methods for calling mutations from SAM alignments, including CIGAR and MD tag parsers.
|
6
|
+
|
7
|
+
* Annotates mutations in HGVS format: http://www.hgvs.org/mutnomen/recs.html.
|
8
|
+
|
9
|
+
* Incorporates Ensembl VEP lookup.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
```sh
|
14
|
+
gem install bio-sam-mutation
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'bio-sam-mutation'
|
21
|
+
|
22
|
+
# NB must be tab-delimited
|
23
|
+
insertion_and_deletion = Bio::DB::Alignment.new("I2M5K:00253:00406\t0\t5\t112839854\t70\t63M2I138M1D27M7S\t*\t0\t0\tCAGTGATCTTCCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACACCTCCACCATACCTCCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCACCTACTGCTGAAAAGAGAGAGAGTGGACCTAAGCAAGCTGCAGTAAATGCTGCAGTTCAGAGGGTCCAGGTTCTTCCAGATGCTGATACTTATTACATTTTGCCACGGAAAGTACTGCTGAGG\t@CDDDCCCCACACCCCCCCC?CCACCCC>A6;;;;7;;6;6;BC;;6;;;;;.;;>ADDA??;;;;;?CCACCCD>C??@CCCC>C@C;>?CCCC@C=::@:::::+:::/:CCC?>>>>CCCCDDD9CCCC@AB????=AB>??;?BB>@@@AA???CC<@@?????BB>??;;;B<BC;??8;6:A=@=@BBB;;;?<77//*08*088888*8=9=?B7;;4;??????????<\tPG:Z:novoalign\tAS:i:183\tUQ:i:183\tNM:i:3\tMD:Z:201^T27")
|
24
|
+
|
25
|
+
insertion_and_deletion.mutations
|
26
|
+
#=> [#<Bio::Mutation:0x007fa20b5b4fc8 @position=112839916, @type=:insertion, @reference=nil, @mutant="AT", @seqname="5">, #<Bio::Mutation:0x007fa20b5b4960 @position=112840055, @type=:deletion, @reference="T", @mutant=nil, @seqname="5">]
|
27
|
+
|
28
|
+
insertion_and_deletion.mutations.first.to_hgvs("g")
|
29
|
+
#=> "5:g.112839916_112839917insAT"
|
30
|
+
|
31
|
+
puts YAML.dump(insertion_and_deletion.mutations.first.vep("human","g").first["transcript_consequences"].keep_if{|c| c["transcript_id"] == "ENST00000257430"})
|
32
|
+
#---
|
33
|
+
# - variant_allele: AT
|
34
|
+
# cdna_end: 4379
|
35
|
+
# codons: cca/ccATa
|
36
|
+
# protein_end: 1441
|
37
|
+
# strand: 1
|
38
|
+
# hgnc_id: HGNC:583
|
39
|
+
# amino_acids: P/PX
|
40
|
+
# gene_symbol: APC
|
41
|
+
# cdna_start: 4378
|
42
|
+
# transcript_id: ENST00000257430
|
43
|
+
# cds_start: 4322
|
44
|
+
# gene_id: ENSG00000134982
|
45
|
+
# protein_start: 1441
|
46
|
+
# biotype: protein_coding
|
47
|
+
# gene_symbol_source: HGNC
|
48
|
+
# cds_end: 4323
|
49
|
+
# consequence_terms:
|
50
|
+
# - frameshift_variant
|
51
|
+
# impact: HIGH
|
52
|
+
# => nil
|
53
|
+
|
54
|
+
# E.g. of full request return
|
55
|
+
# http://rest.ensembl.org/documentation/info/vep_hgvs_get
|
56
|
+
insertion_and_deletion.mutations(112839854).first.vep("human","g")
|
57
|
+
# => [{"assembly_name"=>"GRCh38", "end"=>112839917, "seq_region_name"=>"5", "transcript_consequences"=>[{"gene_id"=>"ENSG00000134982", "distance"=>46, "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000502371", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4380, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4379, "transcript_id"=>"ENST00000257430", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>863, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000507379", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4481, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4480, "transcript_id"=>"ENST00000508376", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>409, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000512211", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "variant_allele"=>"AT", "cdna_end"=>4569, "biotype"=>"nonsense_mediated_decay", 
"gene_symbol_source"=>"HGNC", "consequence_terms"=>["3_prime_UTR_variant", "NMD_transcript_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "cdna_start"=>4568, "transcript_id"=>"ENST00000508624", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000258864", "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"Clone_based_vega_gene", "consequence_terms"=>["intron_variant", "NMD_transcript_variant"], "strand"=>1, "gene_symbol"=>"CTC-554D6.1", "transcript_id"=>"ENST00000520401", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "distance"=>2195, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000504915", "impact"=>"MODIFIER"}], "strand"=>1, "id"=>"5:g.112839917_112839918insAT", "allele_string"=>"-/AT", "most_severe_consequence"=>"frameshift_variant", "start"=>112839918}]
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
```
|
62
|
+
|
63
|
+
The API doc is online. For more code examples see the test files in
|
64
|
+
the source tree.
|
65
|
+
|
66
|
+
## Project home page
|
67
|
+
|
68
|
+
For information on the source tree, documentation, examples, issues and
|
69
|
+
how to contribute, see
|
70
|
+
|
71
|
+
http://github.com/stveep/bioruby-sam-mutation
|
72
|
+
|
73
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
74
|
+
|
75
|
+
## Cite
|
76
|
+
|
77
|
+
If you use this software, please cite one of
|
78
|
+
|
79
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
80
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
81
|
+
|
82
|
+
## Biogems.info
|
83
|
+
|
84
|
+
This Biogem is published at [biogems.info](http://biogems.info/index.html#bio-sam)
|
85
|
+
|
86
|
+
## Copyright
|
87
|
+
|
88
|
+
Copyright (c) 2015 stveep. See LICENSE.txt for further details.
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-sam
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/stveep/bioruby-sam.png"
|
5
|
+
/>}[http://travis-ci.org/#!/stveep/bioruby-sam]
|
6
|
+
|
7
|
+
Full description goes here
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-sam
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-sam'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/stveep/bioruby-sam
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
|
39
|
+
* {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-sam
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2015 stveep. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: utf-8
# Build, test and documentation tasks for the bio-sam-mutation gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Release number, shared by the gemspec and the RDoc title below so the two cannot drift apart.
gem_version = '0.4.1'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "bio-sam-mutation"
  gem.version = gem_version
  gem.homepage = "http://github.com/stveep/bioruby-sam-mutation"
  gem.license = "MIT"
  gem.summary = %Q{Parsing and mutation calling from SAM, CIGAR and MD:Z.}
  gem.description = %Q{Simple classes for parsing SAM, CIGAR and MD:Z strings, including slices. Methods for calling mutations in HGVS format and looking up consequences using Ensembl VEP REST API. Developed for calling mutations at an expected position in an alignment - e.g. Amplicon sequencing of CRISPR-induced mutations.}
  gem.email = "spettitt@gmail.com"
  gem.authors = ["Stephen Pettitt"]
  # dependencies defined in Gemfile
  # gem.required_ruby_version = '>= 1.9.3'
  gem.executables << "mutations"
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # The COVERAGE variable is set before running the tests; what reads it is outside this file.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # A VERSION file takes precedence when present; otherwise use the gemspec version
  # rather than leaving the title without a version number.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : gem_version

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-sam-mutation #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/bin/mutations
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for bio-sam-mutation.
#
#   --tag     annotate SAM records (files given as arguments, or a piped stream)
#   --report  write one "<product>-report.txt" per configured product, summarising the
#             alleles seen in each SAM file given as an argument
#
# NOTE(review): --example-config is declared below but nothing in this script acts on it.
$LOAD_PATH.unshift File.join(__dir__, '..', 'lib')
require 'trollop'
require 'bio-sam-mutation'

opts = Trollop::options do
  opt :config, "Configuration file in YAML format. Defaults to ./config.yml. Run with --example-config for example.", type: :string, default: "config.yml"
  opt :example_config, "Show example configuration file."
  opt :output, "Output file, works with single product only. Better to define in config file.", type: :string, default: "output.sam"
  opt :tag, "Tag an input SAM file (or piped stream of SAM data) with mutation calls"
  opt :report, "Produce a report by amplicon for each file given, including read counts and annotation for each allele. (does not work on a stream)"
  opt :mincov, "Minimum fraction of reads to report a mutant allele.", default: 0.01
  opt :flag, "Minimum reads to flag a potential mutant (= no wild type).", default: 20
end

raise "Cannot use report and tag simultaneously." if opts[:report] && opts[:tag]

# Trollop removes and parses the options, leaving either input files or the incoming stream.
# opts[:config] always holds a value (it defaults to "config.yml"), so test for the file itself:
# a missing default file means "no configuration", a missing explicitly named file is an error.
config =
  if File.exist?(opts[:config])
    YAML.load_file(opts[:config]) || {}
  elsif opts[:config_given]
    raise "Configuration file not found: #{opts[:config]}"
  else
    {}
  end

if opts[:tag]
  single_outfile = nil
  if config.key?(:products)
    config[:products].each do |product_name, config_hash|
      config[product_name] = MutationsCLI.set_defaults(config_hash)
    end
    config = MutationsCLI.construct_products(config)
  else
    config[:single_product] = true
    config[:output] ||= opts[:output]
    single_outfile = File.open(config[:output], 'w')
    config[:outfile] = single_outfile
  end

  begin
    ARGF.each do |input|
      MutationsCLI.tag input, config
    end
  ensure
    # Only close the handle opened here; flush it even if tagging raised.
    single_outfile.close if single_outfile && !single_outfile.closed?
  end
end

if opts[:report]
  products = config[:products]
  raise "Report mode requires a :products section in the configuration file." unless products

  products.each do |product_name, config_hash|
    config_hash = MutationsCLI.set_defaults(config_hash)
    lookups = {}   # VEP consequence per HGVS string, shared across the input files
    mutants = []   # files where no wild type allele passed the threshold

    # Interpolation (rather than +) so Symbol product names work too
    File.open("#{product_name}-report.txt", 'w') do |out|
      ARGV.each do |file|
        calls = Hash.new { |h, k| h[k] = MutantAllele.new }
        File.foreach(file) do |input|
          next if input.match(/^@/) # skip sam headers
          sam = Bio::DB::Alignment.new(input)
          # Check correct start in case of file with mixed amplicons
          if config_hash[:start]
            next unless sam.seq.match Regexp.new("^" + config_hash[:start])
          end
          # Must have sufficient mapped length to call mutations in the given interval:
          next if sam.query_unmapped
          next if config_hash[:length] > Bio::Alignment::CIGAR.new(sam.cigar).reference_length - config_hash[:offset]
          key = "w.t."
          muts = MutationsCLI.call_mutations_given_product sam, config_hash
          if muts
            key = muts.to_hgvs
            calls[key].mutations ||= muts
          end
          calls[key].seq ||= sam.query(config_hash[:offset], config_hash[:length])
          calls[key].example ||= sam
          calls[key].count += 1
        end

        # Seed with 0 so a file without usable reads gives 0 rather than nil
        total_reads = calls.values.map { |allele| allele.count }.reduce(0, :+)
        out.puts file
        out.puts "Total reads: #{total_reads}"
        threshold = opts[:mincov] * total_reads
        calls.keep_if { |_key, allele| allele.count > threshold }
        # Flag the file only when wild type is absent AND there are enough reads to trust that.
        # Explicit parentheses: without them `||` binds to the argument of include?.
        mutants << file unless calls.key?("w.t.") || total_reads < opts[:flag]
        calls = calls.sort_by { |_key, allele| allele.count }.reverse.to_h

        calls.each do |key, allele|
          formatted = nil
          if allele.mutations
            hgvs = allele.mutations.to_hgvs
            if allele.mutations.size == 1 # VEP lookup doesn't work for compound mutations
              # A failed lookup is reported as "No VEP result" instead of aborting the report
              vep = begin
                allele.lookup
              rescue RuntimeError
                nil
              end
              result = nil
              # TODO: sort out the caching - the lookup above still runs before the cache is consulted
              if vep
                result = lookups[hgvs] || VepHgvs.consequences_for_transcript(vep, config_hash[:transcript]).first
                if result
                  formatted = [[result["CDS position"].to_s,result["Allele"]].join(" "),[result["Protein start"].to_s,result["Mutation"]].join(" "),result["Consequence"]].join("\t")
                end
              else
                formatted = "No VEP result"
              end
              lookups[hgvs] ||= result
            else
              formatted = "Compound mutant"
            end
          end
          formatted ||= "No mutation"
          out.puts ([allele.seq, allele.count, key, formatted]).join("\t")
        end
        out.puts "\n===================================="
      end
      out.puts "Files with wild type below threshold:"
      out.puts mutants.join("\n")
    end
  end
end
|
data/bin/sam-mutation
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio-sam-mutation'
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
# Thor command-line interface that annotates SAM records with their HGVS mutation calls.
class SamMutationCLI < Thor
  desc "tag FILE [--config=config.yml]", "Tag a SAM file with HGVS annotations."
  # NOTE(review): the config option is declared but not read anywhere in this class.
  option :config
  # Prints every alignment in +file_name+ that has mutation calls, with a "YH" tag
  # holding the HGVS description appended. Header lines (starting with "@") and
  # alignments without mutations produce no output.
  def tag(file_name)
    # File.foreach closes the file when iteration finishes (or raises)
    File.foreach(file_name) do |line|
      next if line.match(/^@/)
      sam = Bio::DB::Alignment.new(line)
      mutations = sam.mutations
      next unless mutations
      new_tag = Bio::DB::Tag.new("YH:m:"+mutations.to_hgvs)
      puts sam.add_tag(new_tag)
    end
  end
end
|
19
|
+
|
20
|
+
SamMutationCLI.start ARGV
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio'
|
12
|
+
require 'bio-ensembl-rest'
|
13
|
+
require 'bio-samtools'
|
14
|
+
require 'oj'
|
15
|
+
require 'yaml'
|
16
|
+
require 'bio-sam-mutation/bio/db/alignment'
|
17
|
+
require 'bio-sam-mutation/bio/alignment/iterate_pairs'
|
18
|
+
require 'bio-sam-mutation/bio/alignment/cigar'
|
19
|
+
require 'bio-sam-mutation/bio/db/tag'
|
20
|
+
require 'bio-sam-mutation/bio/db/tag/md'
|
21
|
+
require 'bio-sam-mutation/bio/vephgvs'
|
22
|
+
require 'bio-sam-mutation/bio/mutation'
|
23
|
+
require 'bio-sam-mutation/bio/mutation_array'
|
24
|
+
require 'bio-sam-mutation/bio/mutantallele'
|
25
|
+
|
26
|
+
require 'bio-sam-mutation/mutationscli'
|
@@ -0,0 +1,239 @@
|
|
1
|
+
# Parse a CIGAR string
|
2
|
+
# An example from Exonerate output. Ideally will also allow SAM file input to be used.
|
3
|
+
# 1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 63
|
4
|
+
# | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
|
5
|
+
# 1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 62
|
6
|
+
#
|
7
|
+
# ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG...
|
8
|
+
# cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17
|
9
|
+
# Insertions (I) are not counted in the reference length
|
10
|
+
# Regexp (from SAM specification): ([0-9]+[MIDNSHP])+|\* - note that in exonerate output the operation letter comes before the number
|
11
|
+
|
12
|
+
# Parsed form of a CIGAR alignment string in either exonerate ("M 3 I 1") or SAM ("3M1I") layout.
#
# The string is stored as +pairs+, an array of [operation, length] elements such as
# [["M", 3], ["I", 1]]. A reference sequence may be attached; #query, #hgnc and the
# reference slice returned by #subalignment need it.
class Bio::Alignment::CIGAR
  include Bio::Alignment::IteratePairs

  class << self
    attr_accessor :reference_operations, :query_operations, :subexp, :regexps
  end

  # One regexp per supported source; the two capture groups are in the order they appear in that format
  self.regexps = {"exonerate" => /([MIDNSHP]{1})(\d+)/, "sam" => /(\d+)([MIDNSHP]{1})/}
  # Type of elements that count towards the reference length:
  # TODO: add full support for other elements S, H etc.
  self.reference_operations = /[MD]/
  self.query_operations = /[MIS]/
  self.subexp = /([atgcAGCT]+)>([atgcAGTC]+)/
  attr_accessor :pairs, :reference

  # string - the CIGAR string; whitespace is ignored and the argument itself is left untouched
  # ref    - optional reference sequence covered by the alignment
  # source - key of CIGAR.regexps ("exonerate" or "sam"); auto-detected from the start of
  #          the string when it is not one of the known keys
  # Raises RuntimeError when the format is neither given nor detectable.
  def initialize(string,ref=nil,source="")
    # Work on a copy: gsub! would alter the caller's string, or fail if it is frozen
    string = string.gsub(/\s+/,"")

    regexps = Bio::Alignment::CIGAR.regexps
    unless regexps.key?(source)
      detected = nil
      regexps.each do |name, regexp|
        # Only a match at the very start of the string identifies the format
        m = string.match(regexp)
        detected = name if m && m.begin(0) == 0
      end
      raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect." unless detected
      source = detected
    end

    # Make an array of [operation, length] pairs; exonerate puts the operation first,
    # every other format is taken to put the number first.
    @pairs = string.scan(regexps[source]).map do |first, second|
      source == "exonerate" ? [first.to_s, second.to_i] : [second.to_s, first.to_i]
    end

    # Include reference sequence if provided
    @reference = ref
    # Check length of reference = sum(M+D)?
    #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length
  end

  # Given an offset in reference sequence and length, return a new CIGAR object corresponding
  # to that subregion of the alignment, with zero-length operations removed.
  # regexp selects which operations count towards offset and length.
  def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations)
    new_array = iterate_pairs(@pairs,offset,length,regexp)
    new_string = new_array.join(" ")
    # -1 from offset as ruby string starts at zero; no slice when no reference was supplied
    sub_reference = @reference && @reference[offset-1,length]
    Bio::Alignment::CIGAR.new(new_string,sub_reference).remove_empty!
  end
  alias_method :slice, :subalignment

  # Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion
  def subcigar(offset,length)
    # Empty regexp matches every operation, so everything is counted
    self.subalignment(offset,length,//)
  end

  # As #subalignment, but M, D and I operations all count towards offset and length
  def unmasked(offset,length)
    self.subalignment(offset,length,/[MDI]/)
  end

  # Drop events no longer than threshold: such deletions become matches,
  # other such non-match operations are removed. Modifies and returns self.
  def remove_small!(threshold=1)
    deletions_to_matches(threshold)
    remove_small_nonmatches(threshold)
    self
  end

  # Remove operations of length zero. Modifies and returns self.
  def remove_empty!
    self.pairs.keep_if{|pair| pair[1] != 0 }
    self
  end

  # Total length of M operations
  def matched_length
    count_type("M")
  end

  # Total length of D operations
  def deleted_length
    count_type("D")
  end

  # Total length of I operations
  def inserted_length
    count_type("I")
  end

  # Total length of S and H operations
  def masked_length
    count_type(/[SH]/)
  end

  # Total length of the operations in CIGAR.reference_operations
  def reference_length
    count_type(Bio::Alignment::CIGAR.reference_operations)
  end

  # Total length of the operations in CIGAR.query_operations
  def query_length
    count_type(Bio::Alignment::CIGAR.query_operations)
  end

  # Output a representation of the query built from the reference sequence (which must be
  # set): matched stretches are upcased reference, deleted bases become "-", and each
  # insertion is shown as the next element of insertions (a String or an Array, taken in
  # order of appearance) or, when insertions is nil, as "[length]".
  # The caller's array is not modified.
  # TODO: Add support for substitution highlighting (e.g lowercasing)
  def query(insertions=nil)
    remaining = insertions.nil? ? nil : Array(insertions).dup
    sequence = []
    total = 0
    @pairs.each do |operation, length|
      case operation
      when "M"
        sequence << @reference[total,length].upcase
        total += length
      when "I"
        sequence << (remaining ? remaining.shift.to_s : '['+length.to_s+']')
      when "D"
        sequence << "-" * length
        total += length
      end
    end
    sequence.join("")
  end

  # Output hgnc variant format given reference position. Only deletions can be accurately annotated from the cigar string;
  # insertions are shown as "N" unless their bases are supplied, and wild type sequences return nil.
  # NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)
  #
  # reference_pos - reference coordinate immediately before the first aligned base
  # insertions    - inserted sequence(s) in order of appearance (String, Array or nil); not modified
  # type          - prefix of the description, e.g. "g"
  # subs          - optional array of extra annotation strings (e.g. substitutions) to append
  # Requires the reference sequence when the alignment contains deletions.
  def hgnc(reference_pos=0,insertions=[],type="g",*subs)
    # Normalise nil/String to an array and copy it so shift does not consume the caller's data
    insertions = Array(insertions).dup
    total = 0
    hgnc_format = []
    @pairs.each do |operation, length|
      case operation
      when "M"
        reference_pos += length
        total += length
      when "D"
        deleted_bases = @reference[total,length].upcase
        first = reference_pos + 1
        span = (length == 1) ? first.to_s : first.to_s + "_" + (reference_pos + length).to_s
        hgnc_format << span + "del" + deleted_bases
        # A deletion consumes reference bases, so later events must be positioned after it
        reference_pos += length
        total += length
      when "I"
        inserted_bases = insertions.empty? ? "N" : insertions.shift
        hgnc_format << (reference_pos).to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
      end
    end
    # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
    subs = subs.first # >1 arguments discarded
    hgnc_format += subs if subs.is_a?(Array) && !subs.empty?

    return nil if hgnc_format.empty?
    # Several events are wrapped as an allele: type.[a;b]
    description = (hgnc_format.length == 1) ? hgnc_format[0] : "[" + hgnc_format.join(";") + "]"
    type.to_s + "." + description
  end

  # TODO combine adjacent operations of the same type into a single pair
  # Not implemented yet: currently does nothing and returns nil.
  def combine_adjacent

  end

  # Returns a hash (keyed by operation type) of three element arrays: the start offset on the reference
  # of each operation of the given type(s) (counted from 0 at the start of the alignment), the length of the operation,
  # and the query offset (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.
  def positions(type)
    total = 0
    qtotal = 0
    hash = Hash.new{|h,k| h[k] = []}
    @pairs.each do |operation, length|
      if (m = operation.match(type))
        hash[m[0]] << [total, length, qtotal]
      end
      total += length if operation.match Bio::Alignment::CIGAR.reference_operations
      qtotal += length if operation.match Bio::Alignment::CIGAR.query_operations
    end
    hash
  end


  private

  # Relabel every pair whose length is <= threshold from D to M (pairs are edited in place)
  def deletions_to_matches(threshold)
    self.pairs = @pairs.each{|pair| pair[0].sub!("D","M") if pair[1] <= threshold}
  end

  # Keep matches of any length and other operations longer than threshold
  def remove_small_nonmatches(threshold)
    self.pairs = @pairs.keep_if{|pair| pair[0] == "M" || pair[1] > threshold}
  end

  # Sum the lengths of all operations matching type (a String or Regexp)
  def count_type(type)
    @pairs.inject(0) do |sum, (operation, length)|
      operation.match(type) ? sum + length : sum
    end
  end

end #class
|