bio-polyploid-tools 0.1.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,128 +1,4 @@
1
1
 
2
- module Bio::NucleicAcid::Data
3
- IUPAC_CODES = {
4
-
5
- 'y' => 'ct',
6
- 'r' => 'ag',
7
- 'w' => 'at',
8
- 's' => 'cg',
9
- 'k' => 'gt',
10
- 'm' => 'ac',
11
-
12
- 'b' => 'cgt',
13
- 'd' => 'agt',
14
- 'h' => 'act',
15
- 'v' => 'acg',
16
-
17
- 'n' => 'acgt',
18
-
19
- 'a' => 'a',
20
- 't' => 't',
21
- 'g' => 'g',
22
- 'c' => 'c',
23
- 'u' => 'u',
24
-
25
- 'ct' => 'y',
26
- 'ag' => 'r',
27
- 'at' => 'w',
28
- 'cg' => 's',
29
- 'gt' => 'k',
30
- 'ac' => 'm',
31
-
32
- 'cgt' => 'b',
33
- 'agt' => 'd',
34
- 'act' => 'h',
35
- 'acg' => 'v',
36
-
37
- 'acgt' => 'n'
38
- }
39
-
40
-
41
- end
42
-
43
- class Bio::NucleicAcid
44
-
45
- IUPAC_CODES = {
46
-
47
- 'y' => 'ct',
48
- 'r' => 'ag',
49
- 'w' => 'at',
50
- 's' => 'cg',
51
- 'k' => 'gt',
52
- 'm' => 'ac',
53
-
54
- 'b' => 'cgt',
55
- 'd' => 'agt',
56
- 'h' => 'act',
57
- 'v' => 'acg',
58
-
59
- 'n' => 'acgt',
60
-
61
- 'a' => 'a',
62
- 't' => 't',
63
- 'g' => 'g',
64
- 'c' => 'c',
65
- 'u' => 'u',
66
-
67
- 'ct' => 'y',
68
- 'ag' => 'r',
69
- 'at' => 'w',
70
- 'cg' => 's',
71
- 'gt' => 'k',
72
- 'ac' => 'm',
73
-
74
- 'cgt' => 'b',
75
- 'agt' => 'd',
76
- 'act' => 'h',
77
- 'acg' => 'v',
78
-
79
- 'acgt' => 'n'
80
- }
81
-
82
- def self.is_unambiguous(base)
83
- "acgtACGT".match(base)
84
- end
85
-
86
- def self.to_IUAPC(bases)
87
- base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
88
- if base == nil
89
- p "Invalid base! #{base}"
90
- base = 'n' #This is a patch... as one of the scripts failed here.
91
- end
92
- base.upcase
93
- end
94
-
95
- def self.is_valid(code, base)
96
- IUPAC_CODES[code.downcase].chars.include? base.downcase
97
- end
98
-
99
- end
100
-
101
- class Bio::Sequence
102
- def self.snps_between(seq1, seq2)
103
- snps=0
104
- for i in (0..seq1.size-1)
105
- snps += 1 if seq1[i] != seq2[i]
106
- end
107
- snps
108
- end
109
- end
110
-
111
- class String
112
- def count_ambiguities
113
- snps=0
114
-
115
- for i in (0..self.size-1)
116
-
117
- snps += 1 if !Bio::NucleicAcid.is_unambiguous(self[i])
118
- end
119
- snps
120
- end
121
-
122
- def upper_case_count
123
- match(/[^A-Z]*/).to_s.size
124
- end
125
- end
126
2
 
127
3
  class Bio::Blat
128
4
  def self.align(database , query , output)
@@ -8,7 +8,7 @@ module Bio::PolyploidTools
8
8
 
9
9
  def initialize(name, path_to_fasta)
10
10
  @name = name
11
- @fasta_db = Bio::DB::Fasta::FastaFile.new(path_to_fasta)
11
+ @fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_fasta})
12
12
  #$stderr.puts "Loading entries for #{name}"
13
13
 
14
14
  @genes = Hash.new
@@ -17,11 +17,11 @@ module Bio::PolyploidTools
17
17
  end
18
18
 
19
19
  def gene_models(path)
20
- @gene_models_db = Bio::DB::Fasta::FastaFile.new(path)
20
+ @gene_models_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
21
21
  @gene_models_path = path
22
22
  end
23
23
 
24
- #Retunrs the sequence for a region in the gene models (exon)
24
+ #Returns the sequence for a region in the gene models (exon)
25
25
  def gene_model_sequence(region)
26
26
  seq=@gene_models_db.fetch_sequence(region)
27
27
 
@@ -30,7 +30,7 @@ module Bio::PolyploidTools
30
30
 
31
31
  #Sets the reference file for the gene models
32
32
  def chromosomes(path)
33
- @chromosomes_db = Bio::DB::Fasta::FastaFile.new(path)
33
+ @chromosomes_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
34
34
  @chromosomes_path = path
35
35
  end
36
36
 
@@ -54,7 +54,7 @@ module Bio::PolyploidTools
54
54
  name = opts[:name]
55
55
  path = opts[:reference_path]
56
56
  path = opts[:alig_path]
57
- chromosomes[name] = Bio::DB::Fasta::FastaFile.new(path)
57
+ chromosomes[name] = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
58
58
  end
59
59
 
60
60
  def add_snp(snp)
@@ -119,11 +119,12 @@ module Bio::PolyploidTools
119
119
  @snp_map.each do | gene, snp_array|
120
120
  snp_array.each do |snp|
121
121
  #file.puts snp.primer_fasta_string
122
+
122
123
  begin
123
124
  file.puts snp.aligned_sequences_fasta
124
125
  rescue Exception=>e
125
126
  @missing_exons << snp.to_s
126
- # $stderr.puts e.to_s
127
+ $stderr.puts e.to_s
127
128
  end
128
129
  end
129
130
  end
@@ -126,12 +126,12 @@ module Bio::PolyploidTools
126
126
  end
127
127
 
128
128
  def global_reference(reference)
129
- @global_reference = Bio::DB::Fasta::FastaFile.new(reference)
129
+ @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
130
130
  @global_reference.load_fai_entries
131
131
  end
132
132
 
133
133
  def reference(reference)
134
- @reference = Bio::DB::Fasta::FastaFile.new(reference)
134
+ @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
135
135
  @reference.load_fai_entries
136
136
  end
137
137
 
@@ -435,9 +435,10 @@ module Bio::PolyploidTools
435
435
  end
436
436
 
437
437
  def sequences_to_align
438
+ p @sequences_to_align.inspect
438
439
  @sequences_to_align = surrounding_parental_sequences.merge(surrounding_exon_sequences) unless @sequences_to_align
439
440
  # p "sequences_to_align"
440
- # p @sequences_to_align.inspect
441
+
441
442
  @sequences_to_align
442
443
  end
443
444
 
@@ -446,10 +447,10 @@ module Bio::PolyploidTools
446
447
  return @aligned_sequences if @aligned_sequences
447
448
  options = ['--maxiterate', '1000', '--localpair', '--quiet']
448
449
  mafft = Bio::MAFFT.new( "mafft" , options)
449
- #puts "Before MAFT:#{sequences_to_align.inspect}"
450
+ # puts "Before MAFT:#{sequences_to_align.inspect}"
450
451
  report = mafft.query_align(sequences_to_align)
451
452
  @aligned_sequences = report.alignment
452
- #puts "MAFFT: #{report.alignment.inspect}"
453
+ # puts "MAFFT: #{report.alignment.inspect}"
453
454
  @aligned_sequences
454
455
  end
455
456
 
@@ -640,7 +641,7 @@ module Bio::PolyploidTools
640
641
  @surrounding_exon_sequences = Bio::Alignment::SequenceHash.new
641
642
  self.exon_list.each do |chromosome, exon|
642
643
  #puts "surrounding_exon_sequences #{flanking_size}"
643
- #puts chromosome
644
+ #puts chromosome
644
645
  #puts exon
645
646
  flanquing_region = exon.target_flanking_region_from_position(position,flanking_size)
646
647
  #TODO: Padd when the exon goes over the regions...
@@ -16,7 +16,7 @@ module Bio::DB::Exonerate
16
16
 
17
17
  target=opts[:target]
18
18
  query=opts[:query]
19
-
19
+ #
20
20
 
21
21
  cmdline = "exonerate --verbose 0 --showalignment no --bestn #{opts[:bestn]} --showvulgar no --model #{opts[:model]} --ryo '#{opts[:ryo]}' #{query} #{target}"
22
22
  status, stdout, stderr = systemu cmdline
@@ -4,6 +4,7 @@ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools
4
4
 
5
5
  #puts path
6
6
  require path
7
+ require 'bio-samtools'
7
8
  require "test/unit"
8
9
 
9
10
  class TestPolyploidTools < Test::Unit::TestCase
@@ -18,13 +19,15 @@ class TestPolyploidTools < Test::Unit::TestCase
18
19
  @b=data_path + "/LIB1722.bam"
19
20
  @f2_a=data_path + "/LIB1716.bam"
20
21
  @f2_b=data_path + "/LIB1719.bam"
21
- @fasta_db = Bio::DB::Fasta::FastaFile.new(@ref)
22
+
23
+ @bfr_path=data_path + "/bfr_out_test.csv"
24
+ @fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>@ref})
22
25
  @fasta_db.load_fai_entries
23
26
  @bam_a = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@a})
24
27
  @bam_b = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@b})
25
28
  @bam_f2_a = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@f2_a})
26
29
  @bam_f2_b = Bio::DB::Sam.new({:fasta=>@ref, :bam=>@f2_b})
27
- puts "SETUP"
30
+ # puts "SETUP"
28
31
  end
29
32
 
30
33
  def teardown
@@ -35,16 +38,105 @@ class TestPolyploidTools < Test::Unit::TestCase
35
38
  setupre
36
39
 
37
40
  reg="gnl|UG|Ta#S22380157"
38
- region = @fasta_db.index.region_for_entry(reg).to_region.to_s
39
- min_cov=2
40
-
41
+ region = @fasta_db.index.region_for_entry(reg).to_region
42
+ min_cov=20
41
43
  puts region.to_s
42
44
 
43
- cons_1 = @bam_a.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
44
- cons_2 = @bam_b.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
45
+ #puts @bam_a.methods
46
+ ref_seq=@fasta_db.fetch_sequence(region)
47
+ reg_a = @bam_a.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
48
+ reg_b = @bam_b.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
49
+ cons_1 = reg_a.consensus
50
+ cons_2 = reg_b.consensus
51
+
52
+ snps_1 = cons_1.count_ambiguities
53
+ snps_2 = cons_2.count_ambiguities
54
+
55
+ called_1 = reg_a.called
56
+ called_2 = reg_b.called
57
+
58
+ snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
59
+ block_size = 1000
60
+ snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
61
+ snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
62
+ snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
63
+
64
+
65
+
66
+ #puts "#{region.entry}\t#{region.size}\t"
67
+ #puts "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
68
+ #puts "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
69
+ #puts "#{snps_tot}\t#{snps_per_1k_tot}\n"
70
+
71
+
72
+ snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
73
+ snps_to_ref = Bio::Sequence.snps_between(cons_1, ref_seq)
74
+ #puts ">ref\n#{ref_seq}"
75
+ #puts ">a\n#{cons_1}"
76
+ #puts ">b\n#{cons_2}"
77
+ #puts "SNPS between: #{snps_tot}"
78
+ #puts "SNPS ref: #{snps_to_ref}"
79
+ #puts "SNPS call: #{snps_to_ref}"
80
+ assert_equal(ref_seq.to_s, "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatctttggtcgactcgttgcgaggttgatcagatggcggaggaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggtggcggttccgtcgccggaggagtctaaggccctcgttgtcgccgagaatgacgctgagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtcgcgaccgagaagaggatttcgctgatcaaggcatgggaggagaacgagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcggaangagaagcgtgccgcggcggaagcgcggcacggcgaggagatcgtcgcggcggaggagaccgccgccaagtaccgcgccaagggtgaagcgccgaagaagctgctcttcggcagaagatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
81
+ assert_equal(cons_1.to_s , "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatytttggtcgactcgttgcgaggttgatcagatggcggaggaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggcggcggttccgtcgccggaggagtctaaggccctsgttgtcgccgagaatgacgcygagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtygcgaccgagaagaggatttcgctgatcaaggcatgggaggagaaygagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcgraaggagaagcgtgccgcggcggaagygcggcrcggcgaggagatcgtcgcggcggaggagaccgccgccaagtaccgcgccaagggtgaggcgccgaagaagctgctcttcggcagaggatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
82
+ assert_equal(cons_2.to_s , "acgcttgaccttaggcctatttaggtgacactatagaacaagtttgtacaaaaaagcaggctggtaccggtccggaattcccgggatatcgtcgacccacgcgtccgcgtccgaccagcacaaacaagactgtactctgggctcctctgactccgtgtcttgctaaaatatytttggtcgactcgttgcgaggttgatcagatggcggasgaagcgaagcaggatgtggcgccacccgcgccggagccgaccgaggacgtcgcggacgagaaggcggcggttccgtcgccggaggartcyaaggccctsgttgtcgccgagaatgacgcygagaagcctgcagctacagggggctcacacgaacgagatgctctgctcacgagggtygcgaccgagaagaggatttcgctgatcaaggcatgggaggagaaygagaaggccaaagccgagaacaaggccgtgaagttgctggcggacatcacctcgtgggagaactccaaggccgcggaactggaagccgagctcaagaagatgcaagagcagctggagaagaagaaggcgcgctgcgtggagaagctcaagaacagcgccgcgacggtgcacaaagaggcgraaggagaagcgtgccgcggcggaagygcggcgcggcgaggagatcgtcgcggcggaggagrccgccgccaagtaccgcgccaagggtgaggcgccgaagaagctgctcttcggcagaagatagatatcgcttcatcttcagcttctctctgtttgaccgnttgcatgtctcctgcccatggcatcacttgtgtatttatctttgggggngatcttagtttgtatggtatcatcaaatgcgtcgtga")
83
+ assert_equal(snps_tot , 6)
84
+ assert_equal(snps_to_ref , 12)
85
+ assert_equal(snps_1,10)
86
+ assert_equal(snps_2,13)
87
+ assert_equal(called_1,617)
88
+ assert_equal(called_2,612)
89
+ end
90
+
91
+ def test_bfr
92
+ setupre
93
+ container = Bio::BFRTools::BFRContainer.new
94
+
95
+ container.reference @ref
96
+ container.parental_1 ( {:path => @a } )
97
+ container.parental_2 ( {:path => @b } )
98
+ container.bulk_1 ( {:path => @f2_a })
99
+ container.bulk_2 ( {:path => @f2_b })
100
+
101
+ i = -1
102
+
103
+ container.init_counters
104
+ output_file = File.open(@bfr_path, "w")
105
+ # puts "Range: #{min}:#{max}"
106
+ assert_equal(@fasta_db.index.entries.size,1)
107
+ reg = nil
108
+ @fasta_db.index.entries.each do | r |
109
+ i = i + 1
110
+
111
+ reg = container.process_region({:region => r.get_full_region.to_s,:output_file => output_file , :min_cov => 5} )
112
+ #puts reg.inspect
113
+ end
114
+
115
+ with_bfr = [210, 297, 300, 645, 674]
116
+
117
+ bases_1 = Array.new
118
+ bases_2 = Array.new
119
+ bases_1 << {:A=>0, :C=>24, :G=>120, :T=>0}
120
+ bases_2 << {:A=>0, :C=>24, :G=>112, :T=>0}
121
+ bases_1 << {:A=>34, :C=>0, :G=>138, :T=>0}
122
+ bases_2 << {:A=>26, :C=>0, :G=>138, :T=>0}
123
+ bases_1 << {:A=>0, :C=>32, :G=>0, :T=>141}
124
+ bases_2 << {:A=>0, :C=>26, :G=>0, :T=>142}
125
+ bases_1 << {:A=>22, :C=>0, :G=>56, :T=>0}
126
+ bases_2 << {:A=>62, :C=>0, :G=>25, :T=>0}
127
+ bases_1 << {:A=>27, :C=>0, :G=>22, :T=>0}
128
+ bases_2 << {:A=>46, :C=>0, :G=>9, :T=>0}
129
+ i = 0
130
+ with_bfr.each do | pos |
131
+ puts pos
132
+ assert_equal(reg.bases_bulk_1[pos - 1 ] , bases_1[i] )
133
+ assert_equal(reg.bases_bulk_2[pos - 1 ] , bases_2[i] )
134
+ i += 1
135
+ end
136
+
137
+
45
138
 
46
- puts cons_2
47
- puts cons_1
139
+ output_file.close
48
140
 
49
141
  end
50
142
 
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-polyploid-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ricardo H. Ramirez-Gonzalez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-31 00:00:00.000000000 Z
11
+ date: 2014-04-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 1.4.2
19
+ version: 1.4.3
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 1.4.2
26
+ version: 1.4.3
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bio-samtools
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 0.6.2
33
+ version: 2.0.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: 0.6.2
40
+ version: 2.0.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: systemu
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 2.5.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 2.5.2
69
83
  description: Repository of tools developed in TGAC and Crop Genetics in JIC to work
70
84
  with polyploid wheat
71
85
  email: ricardo.ramirez-gonzalez@tgac.ac.uk
@@ -74,6 +88,7 @@ executables:
74
88
  - count_variations.rb
75
89
  - filter_blat_by_target_coverage.rb
76
90
  - find_best_blat_hit.rb
91
+ - find_best_exonerate.rb
77
92
  - hexaploid_primers.rb
78
93
  - homokaryot_primers.rb
79
94
  - map_markers_to_contigs.rb
@@ -83,16 +98,19 @@ executables:
83
98
  extensions: []
84
99
  extra_rdoc_files:
85
100
  - README
101
+ - README.md
86
102
  files:
87
103
  - Gemfile
88
104
  - Gemfile.lock
89
105
  - README
106
+ - README.md
90
107
  - Rakefile
91
108
  - VERSION
92
109
  - bin/bfr.rb
93
110
  - bin/count_variations.rb
94
111
  - bin/filter_blat_by_target_coverage.rb
95
112
  - bin/find_best_blat_hit.rb
113
+ - bin/find_best_exonerate.rb
96
114
  - bin/hexaploid_primers.rb
97
115
  - bin/homokaryot_primers.rb
98
116
  - bin/map_markers_to_contigs.rb
@@ -143,9 +161,7 @@ files:
143
161
  - lib/bio/PolyploidTools/PrimerRegion.rb
144
162
  - lib/bio/PolyploidTools/SNP.rb
145
163
  - lib/bio/PolyploidTools/SNPSequence.rb
146
- - lib/bio/SAMToolsExtensions.rb
147
164
  - lib/bio/db/exonerate.rb
148
- - lib/bio/db/fastadb.rb
149
165
  - lib/bio/db/primer3.rb
150
166
  - lib/bioruby-polyploid-tools.rb
151
167
  - test/data/BS00068396_51.fa