bio-polyploid-tools 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: adcaebc142757300631df98a5f672cae5dc76cb7
4
- data.tar.gz: 05e3acefabb42d5ea3c84236f4b79ebf338c6306
2
+ SHA256:
3
+ metadata.gz: '08be9c740b45561cf8de023e6ca63bb6be4ae63e6f89bd1eb4b149da9cf47334'
4
+ data.tar.gz: 94aa0d62f15ad380a35fe2c4bbcd870f2cb984f04c76aa825084b9ab97431d8b
5
5
  SHA512:
6
- metadata.gz: c1fa0e9e177bad1633fc5c4d3fd3f3fe2c2b4fc9915dfeeea943d7559d8850061f9437f703a7fa462163eb7f52431b747216d540ed73e64559ca20cfc7fe471b
7
- data.tar.gz: 4db9d4d8b404378b39af64978d82671d94b5babc42ea4542c9f76b880793c23a5283945dffc4d76d2ba68761c3d30e1b434f867513e8a71c3f60f61b3885cf58
6
+ metadata.gz: 6f15740cb929555b6627eac53dc12b28d75c10709e271a23aef06935c11fb83bf99479afe68d8db5e5bac8d9ecc06c62ac8f17fc4e3066e8ae6de1094b3fb042
7
+ data.tar.gz: 7a8cee46ca1ecf4a6ed71b497005f32f851067667c59e36a6b91bea3e8153c9beee4a765866f0849ae0fe83378cc241372fde6368f6fddc11e426a0a12415c36
@@ -0,0 +1,17 @@
1
+ language: ruby
2
+ sudo: false
3
+ addons:
4
+ apt:
5
+ packages:
6
+ - zlib1g-dev
7
+ - libncurses5-dev
8
+ - libtinfo-dev
9
+ - exonerate
10
+ rvm:
11
+ - 2.1.10
12
+ - 2.2.5
13
+ - 2.3.5
14
+ - 2.4.2
15
+
16
+ before_install:
17
+ - export RUBYOPT="-W1"
data/Gemfile CHANGED
@@ -3,15 +3,18 @@ source "http://rubygems.org"
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
5
 
6
- gem "bio", ">= 1.4.3"
7
- gem "bio-samtools", ">= 2.0.4"
8
- gem "rake"
9
- gem "jeweler"
6
+ gem "bio", ">= 1.5.1"
7
+ gem "bio-samtools", ">= 2.6.2"
8
+ #gem "rake"
10
9
 
11
10
  gem "systemu", ">=2.5.2"
12
11
 
13
12
  group :development do
14
- # gem "shoulda", ">= 0"
15
- # gem "shoulda-context"
16
- # gem "shoulda-matchers"
13
+ gem "shoulda", ">= 2.10"
14
+ gem 'test-unit'
15
+ if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
16
+ gem "jeweler", "= 2.0.1"
17
+ else
18
+ gem "juwelier"
19
+ end
17
20
  end
data/README.md CHANGED
@@ -52,6 +52,43 @@ Usage: polymarker.rb [options]
52
52
  -P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
53
53
  ```
54
54
 
55
+ ## Input formats
56
+
57
+ The following formats are used to define the marker sequences:
58
+
59
+ ### Marker list
60
+
61
+ If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence is included in the file. The format contains 3 columns (the order is important):
62
+
63
+ * **snp_name** The ID of the marker. Must be unique.
64
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
65
+ * **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
66
+
67
+ #### Example:
68
+
69
+ ```
70
+ BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
71
+ ```
72
+
73
+ ### SNP list
74
+
75
+ If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
76
+
77
+ * **scaffold** The scaffold where the SNP is.
78
+ * **reference allele** The base in the reference (may or may not be the same as in the reference file).
79
+ * **position** Position of the SNP. The first base in the scaffold is base 1.
80
+ * **alternative allele** The base in the alternative allele.
81
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
82
+
83
+ ####Example
84
+
85
+ ```
86
+ IWGSC_CSS_1AL_scaff_110,C,519,A,2A
87
+ ```
88
+
89
+ This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
90
+
91
+
55
92
  ###Custom reference sequences.
56
93
  By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
57
94
  ) are used. However, it is possible to use a custom reference. To define the chromosome where each contig belongs the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters in each contig are used as identifier, useful when pseudomolecules are named after the chromosomes (ie: ">1A" in the fasta file).
@@ -71,6 +108,13 @@ end
71
108
 
72
109
  The function should return a 2 character string, when the first is the chromosome number and the second the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
73
110
 
111
+ ##Using blast
112
+
113
+ To use blast instead of exonerate, use the following command:
114
+
115
+ ```
116
+ ./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.fa --aligner blast -a arm_selection_first_two
117
+ ```
74
118
 
75
119
 
76
120
  ##Release Notes
data/Rakefile CHANGED
@@ -12,16 +12,25 @@ begin
12
12
  end
13
13
  require 'rake'
14
14
 
15
- require 'jeweler'
16
15
 
17
- Jeweler::Tasks.new do |gem|
16
+ if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
17
+ require 'jeweler'
18
+ @taskClass = Jeweler
19
+ else
20
+ require 'juwelier'
21
+ @taskClass = Juwelier
22
+ end
23
+
24
+
25
+
26
+ @taskClass::Tasks.new do |gem|
18
27
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
19
28
  gem.name = "bio-polyploid-tools"
20
29
  gem.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
21
30
  gem.license = "MIT"
22
31
  gem.summary = %Q{Tool to work with polyploids, NGS and molecular biology}
23
- gem.description = %Q{Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat}
24
- gem.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
32
+ gem.description = %Q{Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat}
33
+ gem.email = "ricardo.ramirez-gonzalez@jic.ac.uk"
25
34
  gem.authors = ["Ricardo H. Ramirez-Gonzalez"]
26
35
  # Include your dependencies below. Runtime dependencies are required when using your gem,
27
36
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
@@ -29,7 +38,7 @@ Jeweler::Tasks.new do |gem|
29
38
  # gem.add_development_dependency 'rspec', '> 1.2.3'
30
39
  # gem.extensions = "ext/mkrf_conf.rb"
31
40
  end
32
- Jeweler::RubygemsDotOrgTasks.new
41
+ @taskClass::RubygemsDotOrgTasks.new
33
42
 
34
43
  require 'rake/testtask'
35
44
  Rake::TestTask.new(:test) do |test|
@@ -50,12 +59,3 @@ end
50
59
 
51
60
  task :default => :test
52
61
 
53
- #require 'rdoc/task'
54
- ##RDoc::Task.new do |rdoc|
55
- # version = File.exist?('VERSION') ? File.read('VERSION') : ""
56
-
57
- # rdoc.rdoc_dir = 'rdoc'
58
- # rdoc.title = "bio-samtools #{version}"
59
- # rdoc.rdoc_files.include('README*')
60
- # rdoc.rdoc_files.include('lib/**/*.rb')
61
- #end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.3
1
+ 0.8.0
data/bin/bfr.rb CHANGED
@@ -50,11 +50,11 @@ OptionParser.new do |opts|
50
50
  options[:bulk_2] = o
51
51
  end
52
52
 
53
- opts.on("-m", "--chunk_size FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
53
+ opts.on("-m", "--chunk_size FILE", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
54
54
  options[:chunk_size] = o.to_i
55
55
  end
56
56
 
57
- opts.on("-n", "--chunk FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
57
+ opts.on("-n", "--chunk FILE", "Chunk number. Must be less than chunk_size. ") do |o|
58
58
  options[:chunk] = o.to_i
59
59
  end
60
60
 
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; each one can be overridden from the command line.
# NOTE(review): :identity and :min_bases are parsed here but are not read
# anywhere else in this script.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

# The parser is kept in a local so the usage text can be printed or inspected.
parser = OptionParser.new do |opts|
  # The banner previously advertised "filter_blat.rb", which is another script.
  opts.banner = "Usage: blast_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end
end

parser.parse!
51
+
52
+
53
# Aligns two FASTA files with BLAST and summarises the best hit.
#
# Runs +program+ with +path_a+ as query and +path_b+ as subject, writing an
# XML report (outfmt 5) to +out_path+, and then parses that report. For every
# hit, the alignment length, identities and (only for blastp) positives of
# all its HSPs are added up; the hit with the largest summed alignment length
# is the one reported.
#
# @param path_a [String] FASTA file used as the query
# @param path_b [String] FASTA file used as the subject
# @param out_path [String] file where the XML report is written
# @param program [String] BLAST executable; also passed as the -task value
# @return [Array] [length, percent identity, percent positives]. All values
#   are zero when there is no hit or when BLAST could not be run.
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-list form: no shell is involved, so paths containing spaces or
  # shell metacharacters reach BLAST verbatim.
  executed = system(program, "-query", path_a, "-subject", path_b,
                    "-task", program, "-out", out_path, "-outfmt", "5")

  # Callers reuse the same out_path for every pair. Parsing it after a failed
  # run would report the previous pair's alignment, so bail out instead.
  unless executed
    $stderr.puts "WARN: #{program} failed for #{path_a} vs #{path_b}"
    return [0, 0.0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  max_similarity = 0.0

  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      align_len = 0
      identity = 0.0
      positives = 0.0
      hit.each do |hsp|
        align_len += hsp.align_len
        identity += hsp.identity
        positives += hsp.positive if program == "blastp"
      end

      next unless align_len > max_length

      max_length = align_len
      max_pident = 100 * identity / align_len
      max_similarity = 100 * positives / align_len
    end
  end

  [max_length, max_pident, max_similarity]
end
83
+
84
# Main body of the script: loads the sequences, then BLASTs every pair of
# homoeologues (A-B, A-D, B-D) listed in the triads CSV and prints one
# tab-separated row per pair to stdout.

split_token = options[:split_token]

# Index the FASTA by gene name (the text before split_token in the entry id),
# keeping only the longest entry seen for each gene.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# One scratch FASTA per genome plus a single BLAST report, all overwritten on
# every triad.
genomes = ["A", "B", "D"]
tmp_paths = {}
genomes.each { |g| tmp_paths[g] = options[:tmp_folder] + "/#{g}.fa" }
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id" , "query" , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length" , "pident" , "psimilarity" ].join("\t")

# Random sampling: each triad is kept with probability random_sample / lines.
# A random_sample of 0 disables sampling and every triad is aligned. A request
# larger than the file used to yield a probability above 1, which skipped
# every triad; it now simply keeps them all.
count_lines = File.foreach(options[:triads]).count
sampling = options[:random_sample] > 0
probability = options[:random_sample] / count_lines.to_f
prng = Random.new

CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id']

  sampled = sampling && prng.rand < probability
  next if sampling && !sampled

  # Write the sequence of every gene that is present in the FASTA.
  names = {}
  entries = {}
  genomes.each do |g|
    names[g] = row[g]
    entry = sequences[names[g]]
    next unless entry
    entries[g] = entry
    File.open(tmp_paths[g], 'w') { |f| f.write(entry) }
  end

  # Sampled triads also keep their inputs and reports under random_sample/.
  save_folder = "random_sample/#{triad}"
  if sampled
    FileUtils.mkdir_p save_folder
    entries.each_key { |g| FileUtils.cp(tmp_paths[g], save_folder) }
  end

  [["A", "B"], ["A", "D"], ["B", "D"]].each do |query, subject|
    next unless entries[query] && entries[subject]

    # The query/subject columns name the genes actually aligned; previously
    # the A->D and B->D rows printed the A and B gene names.
    to_print = [triad, names[query], names[subject], query, subject, "#{query}->#{subject}"]
    to_print << blast_pair_fast(tmp_paths[query], tmp_paths[subject], out_tmp, program: options[:program])
    FileUtils.cp(out_tmp, "#{save_folder}/#{query}_#{subject}.xml") if sampled && File.exist?(out_tmp)
    puts to_print.join("\t")
  end
end
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
+ options = {}
11
+ options[:identity] = 50
12
+ options[:min_bases] = 200
13
+ options[:split_token] = "-"
14
+ options[:tmp_folder] = Dir.mktmpdir
15
+ options[:program] = "blastn"
16
+ options[:random_sample] = 0
17
+ options[:cut_promoter_length] = 0
18
+ options[:reverse] = true
19
+
20
+ OptionParser.new do |opts|
21
+
22
+ opts.banner = "Usage: filter_blat.rb [options]"
23
+
24
+ opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
25
+ options[:identity] = o.to_f
26
+ end
27
+ opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
28
+ options[:min_bases] = o.to_i
29
+ end
30
+
31
+ opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
32
+ options[:triads] = o
33
+ end
34
+
35
+ opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
36
+ options[:fasta] = o
37
+ end
38
+
39
+ opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
40
+ options[:split_token] = o
41
+ end
42
+
43
+ opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
44
+ options[:program] = o
45
+ end
46
+
47
+ opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
48
+ options[:random_sample] = o.to_i
49
+ end
50
+
51
+ opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
52
+ options[:cut_promoter_length] = o.to_i
53
+ end
54
+
55
+ opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
56
+ if o == 'T'
57
+ options[:reverse] = true
58
+ elsif o == 'F'
59
+ options[:reverse] = false
60
+ else
61
+ $stderr.puts "Invalid option for reverse (should be T or F)"
62
+ exit -1
63
+ end
64
+ end
65
+ end.parse!
66
+
67
+
68
+ def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
69
+ cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
70
+ #puts cmd
71
+ executed = system cmd
72
+ result = []
73
+ blast_version = nil
74
+ n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
75
+ longest = nil
76
+ max_length = 0
77
+ max_pident = 0.0
78
+ n.each do | iter |
79
+ iter.each do | hit |
80
+ hit.each do | hsp |
81
+ if hsp.align_len > max_length
82
+ max_length = hsp.align_len
83
+ max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
84
+ end
85
+ end
86
+ end
87
+ end
88
+ [max_length, max_pident]
89
+ end
90
+
91
+ valid_pairs_A_B = Hash.new
92
+ valid_pairs_A_D = Hash.new
93
+ valid_pairs_B_D = Hash.new
94
+
95
+ split_token = options[:split_token]
96
+
97
+ sequences = Hash.new
98
+ sequence_count=0
99
+ Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
100
+ fasta_file.each do |entry|
101
+ gene_name = entry.entry_id.split(split_token)[0]
102
+ seq = entry.naseq
103
+ seq.reverse_complement! if options[:reverse]
104
+ seq = seq[0,options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
105
+ entry.data = seq
106
+ sequences[gene_name] = entry unless sequences[gene_name]
107
+ sequences[gene_name] = entry if entry.length > sequences[gene_name].length
108
+ sequence_count += 1
109
+ end
110
+ end
111
+
112
+ $stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
113
+ #FileUtils.mkdir_p(options[:tmp_folder])
114
+ $stderr.puts "TMP dir: #{options[:tmp_folder]}"
115
+
116
+ a_tmp = options[:tmp_folder] + "/A.fa"
117
+ b_tmp = options[:tmp_folder] + "/B.fa"
118
+ d_tmp = options[:tmp_folder] + "/D.fa"
119
+ out_tmp = options[:tmp_folder] + "/out.blast"
120
+
121
+
122
+ puts [
123
+ "group_id" , "query" , "subject" ,
124
+ "chr_query", "chr_subject", "aln_type",
125
+ "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")
126
+
127
+ count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
128
+
129
+ probability = options[:random_sample] / count_lines.to_f
130
+ probability = 1 if options[:random_sample] == 0
131
+ prng = Random.new
132
+ #puts probability
133
+ prom_len = options[:cut_promoter_length]
134
+ CSV.foreach(options[:triads], headers:true ) do |row|
135
+ a = row['A']
136
+ b = row['B']
137
+ d = row['D']
138
+ triad = row['group_id'].to_i
139
+ triad_folder = triad/100
140
+
141
+ save = probability > prng.rand && probability < 1
142
+ run = probability == 1 || save
143
+ next unless run
144
+
145
+ seq_a = sequences[a]
146
+ seq_b = sequences[b]
147
+ seq_d = sequences[d]
148
+ File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
149
+ File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
150
+ File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d
151
+
152
+ ns_a = seq_a.seq.count('Nn') if seq_a
153
+ ns_b = seq_b.seq.count('Nn') if seq_b
154
+ ns_d = seq_d.seq.count('Nn') if seq_d
155
+
156
+ save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
157
+
158
+ #if save
159
+ FileUtils.mkdir_p save_folder
160
+ FileUtils.cp(a_tmp, save_folder) if seq_a
161
+ FileUtils.cp(b_tmp, save_folder) if seq_b
162
+ FileUtils.cp(d_tmp, save_folder) if seq_d
163
+ #end
164
+
165
+ if seq_a and seq_b
166
+ to_print = [triad, a, b , "A","B","A->B"]
167
+ to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
168
+ to_print << ns_a
169
+ to_print << ns_b
170
+ to_print << ns_a + ns_b
171
+ FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") #if save
172
+ puts to_print.join("\t")
173
+ end
174
+ if seq_a and seq_d
175
+ to_print = [triad, a, b , "A","D","A->D"]
176
+ to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program])
177
+ to_print << ns_a
178
+ to_print << ns_d
179
+ to_print << ns_a + ns_d
180
+ FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") #if save
181
+ puts to_print.join("\t")
182
+ end
183
+ if seq_b and seq_d
184
+ to_print = [triad, a, b , "B","D","B->D"]
185
+ to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
186
+ to_print << ns_b
187
+ to_print << ns_d
188
+ to_print << ns_b + ns_d
189
+ FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") #if save
190
+ puts to_print.join("\t")
191
+ end
192
+ end