bio-polyploid-tools 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '08be9c740b45561cf8de023e6ca63bb6be4ae63e6f89bd1eb4b149da9cf47334'
|
4
|
+
data.tar.gz: 94aa0d62f15ad380a35fe2c4bbcd870f2cb984f04c76aa825084b9ab97431d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f15740cb929555b6627eac53dc12b28d75c10709e271a23aef06935c11fb83bf99479afe68d8db5e5bac8d9ecc06c62ac8f17fc4e3066e8ae6de1094b3fb042
|
7
|
+
data.tar.gz: 7a8cee46ca1ecf4a6ed71b497005f32f851067667c59e36a6b91bea3e8153c9beee4a765866f0849ae0fe83378cc241372fde6368f6fddc11e426a0a12415c36
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -3,15 +3,18 @@ source "http://rubygems.org"
|
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
5
|
|
6
|
-
gem "bio", ">= 1.
|
7
|
-
gem "bio-samtools", ">= 2.
|
8
|
-
gem "rake"
|
9
|
-
gem "jeweler"
|
6
|
+
gem "bio", ">= 1.5.1"
|
7
|
+
gem "bio-samtools", ">= 2.6.2"
|
8
|
+
#gem "rake"
|
10
9
|
|
11
10
|
gem "systemu", ">=2.5.2"
|
12
11
|
|
13
12
|
group :development do
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
gem "shoulda", ">= 2.10"
|
14
|
+
gem 'test-unit'
|
15
|
+
if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
|
16
|
+
gem "jeweler", "= 2.0.1"
|
17
|
+
else
|
18
|
+
gem "juwelier"
|
19
|
+
end
|
17
20
|
end
|
data/README.md
CHANGED
@@ -52,6 +52,43 @@ Usage: polymarker.rb [options]
|
|
52
52
|
-P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
|
53
53
|
```
|
54
54
|
|
55
|
+
## Input formats
|
56
|
+
|
57
|
+
The following formats are used to define the marker sequences:
|
58
|
+
|
59
|
+
### Marker list
|
60
|
+
|
61
|
+
If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence are included in the file. The format contains 3 columns (the order is important):
|
62
|
+
|
63
|
+
* **snp_name** The ID of the marker. Must be unique.
|
64
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
65
|
+
* **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
|
66
|
+
|
67
|
+
#### Example:
|
68
|
+
|
69
|
+
```
|
70
|
+
BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
|
71
|
+
```
|
72
|
+
|
73
|
+
### SNP list
|
74
|
+
|
75
|
+
If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
|
76
|
+
|
77
|
+
* **scaffold** The scaffold where the SNP is.
|
78
|
+
* **reference allele** The base in the reference (may or may not be the same as in the reference file).
|
79
|
+
* **position** Position of the SNP. The first base in the scaffold is base 1.
|
80
|
+
* **alternative allele** The base in the alternative allele.
|
81
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
82
|
+
|
83
|
+
#### Example:
|
84
|
+
|
85
|
+
```
|
86
|
+
IWGSC_CSS_1AL_scaff_110,C,519,A,2A
|
87
|
+
```
|
88
|
+
|
89
|
+
This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
|
90
|
+
|
91
|
+
|
55
92
|
###Custom reference sequences.
|
56
93
|
By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
|
57
94
|
) are used. However, it is possible to use a custom reference. To define the chromosome where each contig belongs the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters in each contig are used as identifier, useful when pseudomolecules are named after the chromosomes (i.e. ">1A" in the fasta file).
|
@@ -71,6 +108,13 @@ end
|
|
71
108
|
|
72
109
|
The function should return a 2 character string, when the first is the chromosome number and the second the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
|
73
110
|
|
111
|
+
##Using blast
|
112
|
+
|
113
|
+
To use blast instead of exonerate, use the following command:
|
114
|
+
|
115
|
+
```
|
116
|
+
./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.fa --aligner blast -a arm_selection_first_two
|
117
|
+
```
|
74
118
|
|
75
119
|
|
76
120
|
##Release Notes
|
data/Rakefile
CHANGED
@@ -12,16 +12,25 @@ begin
|
|
12
12
|
end
|
13
13
|
require 'rake'
|
14
14
|
|
15
|
-
require 'jeweler'
|
16
15
|
|
17
|
-
|
16
|
+
if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
|
17
|
+
require 'jeweler'
|
18
|
+
@taskClass = Jeweler
|
19
|
+
else
|
20
|
+
require 'juwelier'
|
21
|
+
@taskClass = Juwelier
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
@taskClass::Tasks.new do |gem|
|
18
27
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
19
28
|
gem.name = "bio-polyploid-tools"
|
20
29
|
gem.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
|
21
30
|
gem.license = "MIT"
|
22
31
|
gem.summary = %Q{Tool to work with polyploids, NGS and molecular biology}
|
23
|
-
gem.description = %Q{Repository of tools developed
|
24
|
-
gem.email = "ricardo.ramirez-gonzalez@
|
32
|
+
gem.description = %Q{Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat}
|
33
|
+
gem.email = "ricardo.ramirez-gonzalez@jic.ac.uk"
|
25
34
|
gem.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
26
35
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
27
36
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
@@ -29,7 +38,7 @@ Jeweler::Tasks.new do |gem|
|
|
29
38
|
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
30
39
|
# gem.extensions = "ext/mkrf_conf.rb"
|
31
40
|
end
|
32
|
-
|
41
|
+
@taskClass::RubygemsDotOrgTasks.new
|
33
42
|
|
34
43
|
require 'rake/testtask'
|
35
44
|
Rake::TestTask.new(:test) do |test|
|
@@ -50,12 +59,3 @@ end
|
|
50
59
|
|
51
60
|
task :default => :test
|
52
61
|
|
53
|
-
#require 'rdoc/task'
|
54
|
-
##RDoc::Task.new do |rdoc|
|
55
|
-
# version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
56
|
-
|
57
|
-
# rdoc.rdoc_dir = 'rdoc'
|
58
|
-
# rdoc.title = "bio-samtools #{version}"
|
59
|
-
# rdoc.rdoc_files.include('README*')
|
60
|
-
# rdoc.rdoc_files.include('lib/**/*.rb')
|
61
|
-
#end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.8.0
|
data/bin/bfr.rb
CHANGED
@@ -50,11 +50,11 @@ OptionParser.new do |opts|
|
|
50
50
|
options[:bulk_2] = o
|
51
51
|
end
|
52
52
|
|
53
|
-
opts.on("-m", "--chunk_size FILE", "
|
53
|
+
opts.on("-m", "--chunk_size FILE", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
|
54
54
|
options[:chunk_size] = o.to_i
|
55
55
|
end
|
56
56
|
|
57
|
-
opts.on("-n", "--chunk FILE", "
|
57
|
+
opts.on("-n", "--chunk FILE", "Chunk number. Must be less than chunk_size. ") do |o|
|
58
58
|
options[:chunk] = o.to_i
|
59
59
|
end
|
60
60
|
|
data/bin/blast_triads.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Command-line configuration for blast_triads.rb.
# The defaults below are used unless overridden by the flags declared in the
# OptionParser block. :triads and :fasta have no default and must be supplied.
options = {}
options[:identity] = 50                # set by -i; not read elsewhere in this script
options[:min_bases] = 200              # set by -c; not read elsewhere in this script
options[:split_token] = "-"            # gene name = sequence id up to this token
options[:tmp_folder] = Dir.mktmpdir    # scratch directory for per-pair FASTA/BLAST files
options[:program] = "blastn"
options[:random_sample] = 0            # 0 means "run every triad"

OptionParser.new do |opts|

  # The banner names this script (it previously showed an unrelated script name).
  opts.banner = "Usage: blast_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

end.parse!
|
51
|
+
|
52
|
+
|
53
|
+
# Runs a pairwise BLAST of +path_a+ (query) against +path_b+ (subject), writes
# the XML report (outfmt 5) to +out_path+ and summarises the best hit.
#
# For every hit the HSP alignment lengths, identities and (for blastp only)
# positives are summed; the hit with the largest summed alignment length wins.
#
# @param path_a   [String] FASTA file used as -query
# @param path_b   [String] FASTA file used as -subject
# @param out_path [String] file the BLAST XML output is written to
# @param program  [String] BLAST executable; also passed as -task
# @return [Array] [max_length, max_pident, max_similarity]; all zero when no
#   hit is found or when the BLAST command could not be run successfully
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-list form of system: no shell is involved, so paths containing
  # spaces or shell metacharacters are passed through untouched.
  succeeded = system(program,
                     "-query", path_a,
                     "-subject", path_b,
                     "-task", program,
                     "-out", out_path,
                     "-outfmt", "5")
  unless succeeded
    # Do not parse out_path here: callers reuse the same file for every pair,
    # so after a failed run it may still hold the previous pair's results.
    $stderr.puts "WARN: #{program} failed for #{path_a} vs #{path_b}"
    return [0, 0.0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  max_similarity = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      align_len = 0
      identity = 0.0
      positives = 0.0
      hit.each do |hsp|
        align_len += hsp.align_len
        identity += hsp.identity
        positives += hsp.positive if program == "blastp"
      end
      # align_len is > 0 whenever this branch runs, so the divisions are safe.
      if align_len > max_length
        max_length = align_len
        max_pident = 100 * identity / align_len
        max_similarity = 100 * positives / align_len
      end
    end
  end
  [max_length, max_pident, max_similarity]
end
|
83
|
+
|
84
|
+
# Main body of blast_triads.rb: loads the FASTA sequences, then for each triad
# in the CSV runs the pairwise BLASTs A->B, A->D and B->D and prints one
# tab-separated summary line per pair on stdout.

split_token = options[:split_token]

# Index the FASTA by gene name (sequence id up to split_token), keeping only
# the longest entry seen for each gene.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# Scratch files, overwritten for every triad.
a_tmp = options[:tmp_folder] + "/A.fa"
b_tmp = options[:tmp_folder] + "/B.fa"
d_tmp = options[:tmp_folder] + "/D.fa"
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id" , "query" , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length" , "pident" , "psimilarity" ].join("\t")

# Raw line count of the triads file (this includes the header line).
count_lines = File.foreach(options[:triads]).count

# Chance of running (and saving) each triad. A sample size of 0 means "run
# everything". The value is capped at 1: an uncapped probability above 1
# matched neither the "run all" nor the "sampled" case, so nothing was run.
sample_size = options[:random_sample]
probability = sample_size == 0 ? 1 : [sample_size / count_lines.to_f, 1].min
prng = Random.new

CSV.foreach(options[:triads], headers: true) do |row|
  a = row['A']
  b = row['B']
  d = row['D']
  triad = row['group_id']

  # When sampling (probability < 1) the selected triads are also saved to disk.
  save = probability > prng.rand && probability < 1
  run = probability == 1 || save
  next unless run

  genomes = {
    "A" => { name: a, entry: sequences[a], path: a_tmp },
    "B" => { name: b, entry: sequences[b], path: b_tmp },
    "D" => { name: d, entry: sequences[d], path: d_tmp }
  }

  genomes.each_value do |g|
    File.open(g[:path], 'w') { |f| f.write(g[:entry]) } if g[:entry]
  end

  save_folder = "random_sample/#{triad}"

  if save
    FileUtils.mkdir_p save_folder
    genomes.each_value { |g| FileUtils.cp(g[:path], save_folder) if g[:entry] }
  end

  [%w[A B], %w[A D], %w[B D]].each do |query_chr, subject_chr|
    query = genomes[query_chr]
    subject = genomes[subject_chr]
    next unless query[:entry] && subject[:entry]

    # The query/subject columns hold the genes actually aligned in this row
    # (the A->D and B->D rows used to repeat the A and B gene names).
    to_print = [triad, query[:name], subject[:name],
                query_chr, subject_chr, "#{query_chr}->#{subject_chr}"]
    to_print << blast_pair_fast(query[:path], subject[:path], out_tmp, program: options[:program])
    FileUtils.cp(out_tmp, "#{save_folder}/#{query_chr}_#{subject_chr}.xml") if save
    # Array#join flattens the nested result array into further columns.
    puts to_print.join("\t")
  end
end
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Command-line configuration for blast_triads_promoters.rb.
# The defaults below are used unless overridden by the flags declared in the
# OptionParser block. :triads and :fasta have no default and must be supplied.
options = {}
options[:identity] = 50                # set by -i; not read elsewhere in this script
options[:min_bases] = 200              # set by -c; not read elsewhere in this script
options[:split_token] = "-"            # gene name = sequence id up to this token
options[:tmp_folder] = Dir.mktmpdir    # scratch directory for per-pair FASTA/BLAST files
options[:program] = "blastn"
options[:random_sample] = 0            # 0 means "run every triad"
options[:cut_promoter_length] = 0      # 0 means "do not truncate the sequences"
options[:reverse] = true               # reverse-complement each input sequence

OptionParser.new do |opts|

  # The banner names this script (it previously showed an unrelated script name).
  opts.banner = "Usage: blast_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

  opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
    options[:cut_promoter_length] = o.to_i
  end

  # Only the literal values 'T' and 'F' are accepted; anything else aborts.
  opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
    case o
    when 'T'
      options[:reverse] = true
    when 'F'
      options[:reverse] = false
    else
      $stderr.puts "Invalid option for reverse (should be T or F)"
      exit(-1)
    end
  end
end.parse!
|
66
|
+
|
67
|
+
|
68
|
+
# Runs a pairwise BLAST of +path_a+ (query) against +path_b+ (subject), writes
# the XML report (outfmt 5) to +out_path+ and reports the longest single HSP.
#
# @param path_a   [String] FASTA file used as -query
# @param path_b   [String] FASTA file used as -subject
# @param out_path [String] file the BLAST XML output is written to
# @param program  [String] BLAST executable; also passed as -task
# @return [Array] [max_length, max_pident] for the longest HSP; both zero when
#   no HSP is found or when the BLAST command could not be run successfully
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-list form of system: no shell is involved, so paths containing
  # spaces or shell metacharacters are passed through untouched.
  succeeded = system(program,
                     "-query", path_a,
                     "-subject", path_b,
                     "-task", program,
                     "-out", out_path,
                     "-outfmt", "5")
  unless succeeded
    # Do not parse out_path here: callers reuse the same file for every pair,
    # so after a failed run it may still hold the previous pair's results.
    $stderr.puts "WARN: #{program} failed for #{path_a} vs #{path_b}"
    return [0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      hit.each do |hsp|
        next unless hsp.align_len > max_length
        max_length = hsp.align_len
        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
      end
    end
  end
  [max_length, max_pident]
end
|
90
|
+
|
91
|
+
# Main body of blast_triads_promoters.rb: loads the FASTA sequences
# (optionally reverse-complemented and truncated), then for each triad in the
# CSV runs the pairwise BLASTs A->B, A->D and B->D, keeps the inputs and XML
# outputs under blast_alignments_<len>/, and prints one tab-separated summary
# line per pair on stdout.

split_token = options[:split_token]

# Index the FASTA by gene name (sequence id up to split_token), keeping only
# the longest entry seen for each gene.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    seq = entry.naseq
    seq.reverse_complement! if options[:reverse]
    # Truncation happens after the optional reverse-complement.
    seq = seq[0, options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
    entry.data = seq
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# Scratch files, overwritten for every triad.
a_tmp = options[:tmp_folder] + "/A.fa"
b_tmp = options[:tmp_folder] + "/B.fa"
d_tmp = options[:tmp_folder] + "/D.fa"
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id" , "query" , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")

# Raw line count of the triads file (this includes the header line).
count_lines = File.foreach(options[:triads]).count

# Chance of running each triad. A sample size of 0 means "run everything".
# The value is capped at 1: an uncapped probability above 1 matched neither
# the "run all" nor the "sampled" case, so nothing was run.
sample_size = options[:random_sample]
probability = sample_size == 0 ? 1 : [sample_size / count_lines.to_f, 1].min
prng = Random.new
prom_len = options[:cut_promoter_length]

CSV.foreach(options[:triads], headers: true) do |row|
  a = row['A']
  b = row['B']
  d = row['D']
  triad = row['group_id'].to_i
  # Integer division: triads are grouped into folders of 100 consecutive ids.
  triad_folder = triad / 100

  selected = probability > prng.rand && probability < 1
  run = probability == 1 || selected
  next unless run

  genomes = {
    "A" => { name: a, entry: sequences[a], path: a_tmp },
    "B" => { name: b, entry: sequences[b], path: b_tmp },
    "D" => { name: d, entry: sequences[d], path: d_tmp }
  }

  genomes.each_value do |g|
    next unless g[:entry]
    File.open(g[:path], 'w') { |f| f.write(g[:entry]) }
    g[:ns] = g[:entry].seq.count('Nn') # number of N/n characters in the sequence
  end

  save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"

  # Inputs and outputs are always kept in this script, sampled or not.
  FileUtils.mkdir_p save_folder
  genomes.each_value { |g| FileUtils.cp(g[:path], save_folder) if g[:entry] }

  [%w[A B], %w[A D], %w[B D]].each do |query_chr, subject_chr|
    query = genomes[query_chr]
    subject = genomes[subject_chr]
    next unless query[:entry] && subject[:entry]

    # The query/subject columns hold the genes actually aligned in this row
    # (the A->D and B->D rows used to repeat the A and B gene names).
    to_print = [triad, query[:name], subject[:name],
                query_chr, subject_chr, "#{query_chr}->#{subject_chr}"]
    to_print << blast_pair_fast(query[:path], subject[:path], out_tmp, program: options[:program])
    to_print << query[:ns]
    to_print << subject[:ns]
    to_print << query[:ns] + subject[:ns]
    FileUtils.cp(out_tmp, "#{save_folder}/#{query_chr}_#{subject_chr}.xml")
    # Array#join flattens the nested result array into further columns.
    puts to_print.join("\t")
  end
end
|