bio-polyploid-tools 0.1.0 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,128 +1,4 @@
1
1
 
2
- module Bio::NucleicAcid::Data
3
- IUPAC_CODES = {
4
-
5
- 'y' => 'ct',
6
- 'r' => 'ag',
7
- 'w' => 'at',
8
- 's' => 'cg',
9
- 'k' => 'gt',
10
- 'm' => 'ac',
11
-
12
- 'b' => 'cgt',
13
- 'd' => 'agt',
14
- 'h' => 'act',
15
- 'v' => 'acg',
16
-
17
- 'n' => 'acgt',
18
-
19
- 'a' => 'a',
20
- 't' => 't',
21
- 'g' => 'g',
22
- 'c' => 'c',
23
- 'u' => 'u',
24
-
25
- 'ct' => 'y',
26
- 'ag' => 'r',
27
- 'at' => 'w',
28
- 'cg' => 's',
29
- 'gt' => 'k',
30
- 'ac' => 'm',
31
-
32
- 'cgt' => 'b',
33
- 'agt' => 'd',
34
- 'act' => 'h',
35
- 'acg' => 'v',
36
-
37
- 'acgt' => 'n'
38
- }
39
-
40
-
41
- end
42
-
43
- class Bio::NucleicAcid
44
-
45
- IUPAC_CODES = {
46
-
47
- 'y' => 'ct',
48
- 'r' => 'ag',
49
- 'w' => 'at',
50
- 's' => 'cg',
51
- 'k' => 'gt',
52
- 'm' => 'ac',
53
-
54
- 'b' => 'cgt',
55
- 'd' => 'agt',
56
- 'h' => 'act',
57
- 'v' => 'acg',
58
-
59
- 'n' => 'acgt',
60
-
61
- 'a' => 'a',
62
- 't' => 't',
63
- 'g' => 'g',
64
- 'c' => 'c',
65
- 'u' => 'u',
66
-
67
- 'ct' => 'y',
68
- 'ag' => 'r',
69
- 'at' => 'w',
70
- 'cg' => 's',
71
- 'gt' => 'k',
72
- 'ac' => 'm',
73
-
74
- 'cgt' => 'b',
75
- 'agt' => 'd',
76
- 'act' => 'h',
77
- 'acg' => 'v',
78
-
79
- 'acgt' => 'n'
80
- }
81
-
82
- def self.is_unambiguous(base)
83
- "acgtACGT".match(base)
84
- end
85
-
86
- def self.to_IUAPC(bases)
87
- base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
88
- if base == nil
89
- p "Invalid base! #{base}"
90
- base = 'n' #This is a patch... as one of the scripts failed here.
91
- end
92
- base.upcase
93
- end
94
-
95
- def self.is_valid(code, base)
96
- IUPAC_CODES[code.downcase].chars.include? base.downcase
97
- end
98
-
99
- end
100
-
101
- class Bio::Sequence
102
- def self.snps_between(seq1, seq2)
103
- snps=0
104
- for i in (0..seq1.size-1)
105
- snps += 1 if seq1[i] != seq2[i]
106
- end
107
- snps
108
- end
109
- end
110
-
111
- class String
112
- def count_ambiguities
113
- snps=0
114
-
115
- for i in (0..self.size-1)
116
-
117
- snps += 1 if !Bio::NucleicAcid.is_unambiguous(self[i])
118
- end
119
- snps
120
- end
121
-
122
- def upper_case_count
123
- match(/[^A-Z]*/).to_s.size
124
- end
125
- end
126
2
 
127
3
  class Bio::Blat
128
4
  def self.align(database , query , output)
@@ -8,7 +8,7 @@ module Bio::PolyploidTools
8
8
 
9
9
  def initialize(name, path_to_fasta)
10
10
  @name = name
11
- @fasta_db = Bio::DB::Fasta::FastaFile.new(path_to_fasta)
11
+ @fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_fasta})
12
12
  #$stderr.puts "Loading entries for #{name}"
13
13
 
14
14
  @genes = Hash.new
@@ -17,11 +17,11 @@ module Bio::PolyploidTools
17
17
  end
18
18
 
19
19
  def gene_models(path)
20
- @gene_models_db = Bio::DB::Fasta::FastaFile.new(path)
20
+ @gene_models_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
21
21
  @gene_models_path = path
22
22
  end
23
23
 
24
- #Retunrs the sequence for a region in the gene models (exon)
24
+ #Returns the sequence for a region in the gene models (exon)
25
25
  def gene_model_sequence(region)
26
26
  seq=@gene_models_db.fetch_sequence(region)
27
27
 
@@ -30,7 +30,7 @@ module Bio::PolyploidTools
30
30
 
31
31
  #Sets the reference file for the gene models
32
32
  def chromosomes(path)
33
- @chromosomes_db = Bio::DB::Fasta::FastaFile.new(path)
33
+ @chromosomes_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
34
34
  @chromosomes_path = path
35
35
  end
36
36
 
@@ -54,7 +54,7 @@ module Bio::PolyploidTools
54
54
  name = opts[:name]
55
55
  path = opts[:reference_path]
56
56
  path = opts[:alig_path]
57
- chromosomes[name] = Bio::DB::Fasta::FastaFile.new(path)
57
+ chromosomes[name] = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
58
58
  end
59
59
 
60
60
  def add_snp(snp)
@@ -119,11 +119,12 @@ module Bio::PolyploidTools
119
119
  @snp_map.each do | gene, snp_array|
120
120
  snp_array.each do |snp|
121
121
  #file.puts snp.primer_fasta_string
122
+
122
123
  begin
123
124
  file.puts snp.aligned_sequences_fasta
124
125
  rescue Exception=>e
125
126
  @missing_exons << snp.to_s
126
- # $stderr.puts e.to_s
127
+ $stderr.puts e.to_s
127
128
  end
128
129
  end
129
130
  end
@@ -126,12 +126,12 @@ module Bio::PolyploidTools
126
126
  end
127
127
 
128
128
  def global_reference(reference)
129
- @global_reference = Bio::DB::Fasta::FastaFile.new(reference)
129
+ @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
130
130
  @global_reference.load_fai_entries
131
131
  end
132
132
 
133
133
  def reference(reference)
134
- @reference = Bio::DB::Fasta::FastaFile.new(reference)
134
+ @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
135
135
  @reference.load_fai_entries
136
136
  end
137
137
 
@@ -435,9 +435,10 @@ module Bio::PolyploidTools
435
435
  end
436
436
 
437
437
  def sequences_to_align
438
+ p @sequences_to_align.inspect
438
439
  @sequences_to_align = surrounding_parental_sequences.merge(surrounding_exon_sequences) unless @sequences_to_align
439
440
  # p "sequences_to_align"
440
- # p @sequences_to_align.inspect
441
+
441
442
  @sequences_to_align
442
443
  end
443
444
 
@@ -446,10 +447,10 @@ module Bio::PolyploidTools
446
447
  return @aligned_sequences if @aligned_sequences
447
448
  options = ['--maxiterate', '1000', '--localpair', '--quiet']
448
449
  mafft = Bio::MAFFT.new( "mafft" , options)
449
- #puts "Before MAFT:#{sequences_to_align.inspect}"
450
+ # puts "Before MAFT:#{sequences_to_align.inspect}"
450
451
  report = mafft.query_align(sequences_to_align)
451
452
  @aligned_sequences = report.alignment
452
- #puts "MAFFT: #{report.alignment.inspect}"
453
+ # puts "MAFFT: #{report.alignment.inspect}"
453
454
  @aligned_sequences
454
455
  end
455
456
 
@@ -640,7 +641,7 @@ module Bio::PolyploidTools
640
641
  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
641
642
  self.exon_list.each do |chromosome, exon|
642
643
  #puts "surrounding_exon_sequences #{flanking_size}"
643
- #puts chromosome
644
+ #puts chromosome
644
645
  #puts exon
645
646
  flanquing_region = exon.target_flanking_region_from_position(position,flanking_size)
646
647
  #TODO: Padd when the exon goes over the regions...
@@ -16,7 +16,7 @@ module Bio::DB::Exonerate
16
16
 
17
17
  target=opts[:target]
18
18
  query=opts[:query]
19
-
19
+ #
20
20
 
21
21
  cmdline = "exonerate --verbose 0 --showalignment no --bestn #{opts[:bestn]} --showvulgar no --model #{opts[:model]} --ryo '#{opts[:ryo]}' #{query} #{target}"
22
22
  status, stdout, stderr = systemu cmdline
@@ -4,6 +4,7 @@ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools
4
4
 
5
5
  #puts path
6
6
  require path
7
+ require 'bio-samtools'
7
8
  require "test/unit"
8
9
 
9
10
  class TestPolyploidTools < Test::Unit::TestCase
@@ -18,13 +19,15 @@ class TestPolyploidTools < Test::Unit::TestCase
18
19
  @b=data_path + "/LIB1722.bam"
19
20
  @f2_a=data_path + "/LIB1716.bam"
20
21
  @f2_b=data_path + "/LIB1719.bam"
21
- @fasta_db = Bio::DB::Fasta::FastaFile.new(@ref)
22
+
23
+ @bfr_path=data_path + "/bfr_out_test.csv"
24
+ @fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>@ref})
22
25
  @fasta_db.load_fai_entries
23
26
  @bam_a = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@a})
24
27
  @bam_b = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@b})
25
28
  @bam_f2_a = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@f2_a})
26
29
  @bam_f2_b = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@f2_b})
27
- puts "SETUP"
30
+ # puts "SETUP"
28
31
  end
29
32
 
30
33
  def teardown
@@ -35,16 +38,105 @@ class TestPolyploidTools < Test::Unit::TestCase
35
38
  setupre
36
39
 
37
40
  reg="gnl|UG|Ta#S22380157"
38
- region = @fasta_db.index.region_for_entry(reg).to_region.to_s
39
- min_cov=2
40
-
41
+ region = @fasta_db.index.region_for_entry(reg).to_region
42
+ min_cov=20
41
43
  puts region.to_s
42
44
 
43
- cons_1 = @bam_a.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
44
- cons_2 = @bam_b.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
45
+ #puts @bam_a.methods
46
+ ref_seq=@fasta_db.fetch_sequence(region)
47
+ reg_a = @bam_a.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
48
+ reg_b = @bam_b.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
49
+ cons_1 = reg_a.consensus
50
+ cons_2 = reg_b.consensus
51
+
52
+ snps_1 = cons_1.count_ambiguities
53
+ snps_2 = cons_2.count_ambiguities
54
+
55
+ called_1 = reg_a.called
56
+ called_2 = reg_b.called
57
+
58
+ snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
59
+ block_size = 1000
60
+ snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
61
+ snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
62
+ snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
63
+
64
+
65
+
66
+ #puts "#{region.entry}\t#{region.size}\t"
67
+ #puts "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
68
+ #puts "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
69
+ #puts "#{snps_tot}\t#{snps_per_1k_tot}\n"
70
+
71
+
72
+ snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
73
+ snps_to_ref = Bio::Sequence.snps_between(cons_1, ref_seq)
74
+ #puts ">ref\n#{ref_seq}"
75
+ #puts ">a\n#{cons_1}"
76
+ #puts ">b\n#{cons_2}"
77
+ #puts "SNPS between: #{snps_tot}"
78
+ #puts "SNPS ref: #{snps_to_ref}"
79
+ #puts "SNPS call: #{snps_to_ref}"
80
+ assert_equal(ref_seq.to_s, "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatctttggtcgactcgttgcgaggttgatcagatggcggaggaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggtggcggttccgtcgccggaggagtctaaggccctcgttgtcgccgagaatgacgctgagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtcgcgaccgagaagaggatttcgctgatcaaggcatgggaggagaacgagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcggaangagaagcgtgccgcggcggaagcgcggcacggcgaggagatcgtcgcggcggaggagaccgccgccaagtaccgcgccaagggtgaagcgccgaagaagctgctcttcggcagaagatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
81
+ assert_equal(cons_1.to_s , "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatytttggtcgactcgttgcgaggttgatcagatggcggaggaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggcggcggttccgtcgccggaggagtctaaggccctsgttgtcgccgagaatgacgcygagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtygcgaccgagaagaggatttcgctgatcaaggcatgggaggagaaygagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcgraaggagaagcgtgccgcggcggaagygcggcrcggcgaggagatcgtcgcggcggaggagaccgccgccaagtaccgcgccaagggtgaggcgccgaagaagctgctcttcggcagaggatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
82
+ assert_equal(cons_2.to_s , "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatytttggtcgactcgttgcgaggttgatcagatggcggasgaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggcggcggttccgtcgccggaggartcyaaggccctsgttgtcgccgagaatgacgcygagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtygcgaccgagaagaggatttcgctgatcaaggcatgggaggagaaygagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcgraaggagaagcgtgccgcggcggaagygcggcgcggcgaggagatcgtcgcggcggaggagrccgccgccaagtaccgcgccaagggtgaggcgccgaagaagctgctcttcggcagaagatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
83
+ assert_equal(snps_tot , 6)
84
+ assert_equal(snps_to_ref , 12)
85
+ assert_equal(snps_1,10)
86
+ assert_equal(snps_2,13)
87
+ assert_equal(called_1,617)
88
+ assert_equal(called_2,612)
89
+ end
90
+
91
+ def test_bfr
92
+ setupre
93
+ container = Bio::BFRTools::BFRContainer.new
94
+
95
+ container.reference @ref
96
+ container.parental_1 ( {:path => @a } )
97
+ container.parental_2 ( {:path => @b } )
98
+ container.bulk_1 ( {:path => @f2_a })
99
+ container.bulk_2 ( {:path => @f2_b })
100
+
101
+ i = -1
102
+
103
+ container.init_counters
104
+ output_file = File.open(@bfr_path, "w")
105
+ # puts "Range: #{min}:#{max}"
106
+ assert_equal(@fasta_db.index.entries.size,1)
107
+ reg = nil
108
+ @fasta_db.index.entries.each do | r |
109
+ i = i + 1
110
+
111
+ reg = container.process_region({:region => r.get_full_region.to_s,:output_file => output_file , :min_cov => 5} )
112
+ #puts reg.inspect
113
+ end
114
+
115
+ with_bfr = [210, 297, 300, 645, 674]
116
+
117
+ bases_1 = Array.new
118
+ bases_2 = Array.new
119
+ bases_1 << {:A=>0, :C=>24, :G=>120, :T=>0}
120
+ bases_2 << {:A=>0, :C=>24, :G=>112, :T=>0}
121
+ bases_1 << {:A=>34, :C=>0, :G=>138, :T=>0}
122
+ bases_2 << {:A=>26, :C=>0, :G=>138, :T=>0}
123
+ bases_1 << {:A=>0, :C=>32, :G=>0, :T=>141}
124
+ bases_2 << {:A=>0, :C=>26, :G=>0, :T=>142}
125
+ bases_1 << {:A=>22, :C=>0, :G=>56, :T=>0}
126
+ bases_2 << {:A=>62, :C=>0, :G=>25, :T=>0}
127
+ bases_1 << {:A=>27, :C=>0, :G=>22, :T=>0}
128
+ bases_2 << {:A=>46, :C=>0, :G=>9, :T=>0}
129
+ i = 0
130
+ with_bfr.each do | pos |
131
+ puts pos
132
+ assert_equal(reg.bases_bulk_1[pos - 1 ] , bases_1[i] )
133
+ assert_equal(reg.bases_bulk_2[pos - 1 ] , bases_2[i] )
134
+ i += 1
135
+ end
136
+
137
+
45
138
 
46
- puts cons_2
47
- puts cons_1
139
+ output_file.close
48
140
 
49
141
  end
50
142
 
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-polyploid-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ricardo H. Ramirez-Gonzalez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-31 00:00:00.000000000 Z
11
+ date: 2014-04-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 1.4.2
19
+ version: 1.4.3
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 1.4.2
26
+ version: 1.4.3
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bio-samtools
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 0.6.2
33
+ version: 2.0.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: 0.6.2
40
+ version: 2.0.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: systemu
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 2.5.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 2.5.2
69
83
  description: Repository of tools developed in TGAC and Crop Genetics in JIC to work
70
84
  with polyploid wheat
71
85
  email: ricardo.ramirez-gonzalez@tgac.ac.uk
@@ -74,6 +88,7 @@ executables:
74
88
  - count_variations.rb
75
89
  - filter_blat_by_target_coverage.rb
76
90
  - find_best_blat_hit.rb
91
+ - find_best_exonerate.rb
77
92
  - hexaploid_primers.rb
78
93
  - homokaryot_primers.rb
79
94
  - map_markers_to_contigs.rb
@@ -83,16 +98,19 @@ executables:
83
98
  extensions: []
84
99
  extra_rdoc_files:
85
100
  - README
101
+ - README.md
86
102
  files:
87
103
  - Gemfile
88
104
  - Gemfile.lock
89
105
  - README
106
+ - README.md
90
107
  - Rakefile
91
108
  - VERSION
92
109
  - bin/bfr.rb
93
110
  - bin/count_variations.rb
94
111
  - bin/filter_blat_by_target_coverage.rb
95
112
  - bin/find_best_blat_hit.rb
113
+ - bin/find_best_exonerate.rb
96
114
  - bin/hexaploid_primers.rb
97
115
  - bin/homokaryot_primers.rb
98
116
  - bin/map_markers_to_contigs.rb
@@ -143,9 +161,7 @@ files:
143
161
  - lib/bio/PolyploidTools/PrimerRegion.rb
144
162
  - lib/bio/PolyploidTools/SNP.rb
145
163
  - lib/bio/PolyploidTools/SNPSequence.rb
146
- - lib/bio/SAMToolsExtensions.rb
147
164
  - lib/bio/db/exonerate.rb
148
- - lib/bio/db/fastadb.rb
149
165
  - lib/bio/db/primer3.rb
150
166
  - lib/bioruby-polyploid-tools.rb
151
167
  - test/data/BS00068396_51.fa