bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 89d5e8e03f5d0ad769937950884f4c02591e445a9135817e20feee20489dda24
|
4
|
+
data.tar.gz: 34fa2901aa11717b33be10a5e1405a159c93375e14d1a1783b7d3da7d2d6e966
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0c04602ec3dd28a1bca16a5d30c3292fab2d069304c08111f3749dac1666b7a22291ad7eefc94ecf0f5e1510c720c9c9a9955e484177927bc7506b7d9e3af95a
|
7
|
+
data.tar.gz: 2ec66e342aeb13ba371b7e22da7fcfabb7d690903ddd13de732177d6489a3ad1732f496ae169dc71b7baf76f09a07a64d501ab50672b73194d58199c861b33d4
|
data/.travis.yml
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
language: ruby
|
2
|
+
sudo: false
|
3
|
+
addons:
|
4
|
+
apt:
|
5
|
+
packages:
|
6
|
+
- zlib1g-dev
|
7
|
+
- libncurses5-dev
|
8
|
+
- libtinfo-dev
|
9
|
+
- exonerate
|
10
|
+
- blast2
|
11
|
+
- ncbi-blast+
|
12
|
+
- mafft
|
13
|
+
- primer3
|
14
|
+
before_install:
|
15
|
+
- gem update --system
|
16
|
+
- export RUBYOPT="-W1"
|
17
|
+
rvm:
|
18
|
+
- 2.3
|
19
|
+
- 2.4
|
20
|
+
- 2.5
|
21
|
+
- 2.6
|
22
|
+
- 2.7
|
23
|
+
|
24
|
+
|
data/Gemfile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Fetch gems over TLS; the plain-HTTP endpoint lets gem downloads be tampered with in transit.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

gem "bio", ">= 1.5.1"
gem "bio-samtools-wrapper", ">= 2.7.0"
gem "descriptive_statistics"
#gem "rake"

gem "sorted_set"

gem "systemu", ">=2.5.2"

group :development do
  gem "shoulda", ">= 2.10"
  gem 'test-unit'
  # Ruby 2.0-2.2 are pinned to jeweler 2.0.1; every other interpreter uses juwelier.
  if RUBY_VERSION.start_with?("2.0", "2.1", "2.2")
    gem "jeweler", "= 2.0.1"
  else
    gem "juwelier"
  end
end
|
data/README.md
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# bio-polyploid-tools
|
2
|
+
|
3
|
+
## Introduction
|
4
|
+
|
5
|
+
These tools are designed to deal with polyploid wheat. The first tool is to design KASP primers, making them as specific as possible.
|
6
|
+
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
```sh
|
11
|
+
gem install bio-polyploid-tools
|
12
|
+
```
|
13
|
+
You need to have in your ```$PATH``` the following programs:
|
14
|
+
|
15
|
+
* [MAFFT](http://mafft.cbrc.jp/alignment/software/)
|
16
|
+
* [primer3](http://primer3.sourceforge.net/releases.php)
|
17
|
+
* [exonerate](http://www.ebi.ac.uk/~guy/exonerate/)
|
18
|
+
* [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE%3DBlastDocs&DOC_TYPE%3DDownload)
|
19
|
+
|
20
|
+
The code was originally developed on ruby 2.1, 2.3 and 2.5. It may work on older versions. However, it is only actively tested in currently supported ruby versions:
|
21
|
+
|
22
|
+
* 2.1.10
|
23
|
+
* 2.2.5
|
24
|
+
* 2.3.5
|
25
|
+
* 2.4.2
|
26
|
+
* 2.5.0
|
27
|
+
|
28
|
+
# PolyMarker
|
29
|
+
|
30
|
+
To run PolyMarker with the CSS wheat contigs, you need to unzip the reference file from [ensembl](http://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz).
|
31
|
+
|
32
|
+
|
33
|
+
```sh
|
34
|
+
polymarker.rb --contigs Triticum_aestivum.IWGSC2.25.dna.genome.fa --marker_list snp_list.csv --output output_folder
|
35
|
+
```
|
36
|
+
|
37
|
+
The ```snp_list``` file must follow the convention ```ID,Chromosome,SEQUENCE``` with the SNP inside the sequence in the format [A/T]. As a reference, look at test/data/short_primer_design_test.csv
|
38
|
+
|
39
|
+
If you want to use the web interface, visit the [PolyMarker webservice at TGAC](http://polymarker.tgac.ac.uk)
|
40
|
+
|
41
|
+
The available command line arguments are:
|
42
|
+
|
43
|
+
```
|
44
|
+
Usage: polymarker.rb [options]
|
45
|
+
-c, --contigs FILE File with contigs to use as database
|
46
|
+
-m, --marker_list FILE File with the list of markers to search from
|
47
|
+
-g, --genomes_count INT Number of genomes (default 3, for hexaploid)
|
48
|
+
-s, --snp_list FILE File with the list of snps to search from, requires --reference to get the sequence using a position
|
49
|
+
-t, --mutant_list FILE File with the list of positions with mutation and the mutation line.
|
50
|
+
requires --reference to get the sequence using a position
|
51
|
+
-r, --reference FILE Fasta file with the sequence for the markers (to complement --snp_list)
|
52
|
+
-o, --output FOLDER Output folder
|
53
|
+
-e, --exonerate_model MODEL Model to be used in exonerate to search for the contigs
|
54
|
+
-i, --min_identity INT Minimum identity to consider a hit (default 90)
|
55
|
+
-a, --arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two
|
56
|
+
Function to decide the chromome arm
|
57
|
+
-p, --primer_3_preferences FILE file with preferences to be sent to primer3
|
58
|
+
-v, --variation_free_region INT If present, avoid generating the common primer if there are homoeologous SNPs within the specified distance (not tested)
|
59
|
+
-x, --extract_found_contigs If present, save in a separate file the contigs with matches. Useful to debug.
|
60
|
+
-P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
|
61
|
+
```
|
62
|
+
|
63
|
+
## Input formats
|
64
|
+
|
65
|
+
The following formats are used to define the marker sequences:
|
66
|
+
|
67
|
+
### Marker list
|
68
|
+
|
69
|
+
If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence is included in the file. The format contains 3 columns (the order is important):
|
70
|
+
|
71
|
+
* **snp_name** The ID of the marker. Must be unique.
|
72
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
73
|
+
* **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
|
74
|
+
|
75
|
+
#### Example:
|
76
|
+
|
77
|
+
```
|
78
|
+
BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
|
79
|
+
```
|
80
|
+
|
81
|
+
### SNP list
|
82
|
+
|
83
|
+
If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
|
84
|
+
|
85
|
+
* **scaffold** The scaffold where the SNP is.
|
86
|
+
* **reference allele** The base in the reference (may or may not be the same as in the reference file).
|
87
|
+
* **position** Position of the SNP. The first base in the scaffold is base 1.
|
88
|
+
* **alternative allele** The base in the alternative allele.
|
89
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
90
|
+
|
91
|
+
#### Example
|
92
|
+
|
93
|
+
```
|
94
|
+
IWGSC_CSS_1AL_scaff_110,C,519,A,2A
|
95
|
+
```
|
96
|
+
|
97
|
+
This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
|
98
|
+
|
99
|
+
|
100
|
+
### Custom reference sequences.
|
101
|
+
|
102
|
+
By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
|
103
|
+
) are used. However, it is possible to use a custom reference. To define the chromosome where each contig belongs the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters in each contig are used as identifier, useful when pseudomolecules are named after the chromosomes (ie: ">1A" in the fasta file).
|
104
|
+
|
105
|
+
If your contigs follow a different convention, in the file ```ChromosomeArm.rb``` it is possible to define new parsers, by adding at the beginning, with the rest of the parsers, a new lambda like:
|
106
|
+
|
107
|
+
```rb
|
108
|
+
@@arm_selection_functions[:embl] = lambda do | contig_name|
|
109
|
+
arr = contig_name.split('_')
|
110
|
+
ret = "U"
|
111
|
+
ret = arr[2][0,2] if arr.size >= 3
|
112
|
+
ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
113
|
+
ret = arr[0][0,2] if arr.size == 1
|
114
|
+
return ret
|
115
|
+
end
|
116
|
+
```
|
117
|
+
|
118
|
+
The function should return a 2 character string, where the first is the chromosome number and the second the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
|
119
|
+
|
120
|
+
## Using blast
|
121
|
+
|
122
|
+
To use blast instead of exonerate, use the following command:
|
123
|
+
|
124
|
+
```
|
125
|
+
./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.txt --aligner blast -a arm_selection_first_two
|
126
|
+
```
|
127
|
+
|
128
|
+
|
129
|
+
## Release Notes
|
130
|
+
|
131
|
+
### 0.9.7
|
132
|
+
There was some strange issue with the numbering, so bumped to 0.9.7
|
133
|
+
|
134
|
+
* Moved the arm selection function for fields in the chromosome name to the ```ChromosomeArm``` class.
|
135
|
+
|
136
|
+
### 0.8.7
|
137
|
+
* FEATURE: ```polymarker.rb``` now also prints the total number of hits found.
|
138
|
+
|
139
|
+
### 0.8.6
|
140
|
+
|
141
|
+
* BUGFIX: ```primer3.rb``` had a regression when adding the repetitive flag to the ```@values``` array. This led to the wrong order of the columns in the output and possibly other secondary effects.
|
142
|
+
|
143
|
+
### 0.8.5
|
144
|
+
|
145
|
+
* Added the option ```--max_hits``` to ```polymarker.rb``` to set a maximum number of blast hits to identify repetitive regions. This adds the column ```is_repetitve``` to the output. The mask is not calculated in repetitive regions and the primers are designed as non-specific.
|
146
|
+
|
147
|
+
### 0.8.4
|
148
|
+
|
149
|
+
* Added script ```tag_stats.rb``` that gets the descriptive statistics for a tag in a bam file for each reference.
|
150
|
+
|
151
|
+
```bash
|
152
|
+
ruby tag_stats.rb -b HI.3206.006.Index_2.CS_125RNA_14d_Leaf8.sorted.bam -r /Users/ramirezr/Dropbox/JIC/expVIPMetadatas/RefSeq1.0/Genes/annotation/IWGSCv1.0_UTR_ALL.cdnas.fasta --tag 'NH'
|
153
|
+
```
|
154
|
+
|
155
|
+
### 0.8.3
|
156
|
+
|
157
|
+
* BUGFIX: ```ChromosomeArm.rb``` was fixed to conform the module assumptions for the package.
|
158
|
+
|
159
|
+
|
160
|
+
### 0.8.2
|
161
|
+
|
162
|
+
* FEATURE: The functions to select the chromosome arm are now in ```lib/bio/PolyploidTools/ChromosomeArm.rb``` and the help message is updated automatically with the valid options.
|
163
|
+
* FEATURE: Added option ```filter_best``` to replicate the original behaviour of selecting the best hit of each chromosome. Still useful for assemblies which still contain synthetic duplications.
|
164
|
+
|
165
|
+
### 0.8.1
|
166
|
+
|
167
|
+
* BUGFIX: There was an error which prevented the correct localisation of the SNP in markers with gaps in the local alignment before the position with the snp.
|
168
|
+
* FEATURE: PolyMarker now selects the best hit of the target chromosome. This improves the specificity in regions with a recent duplication. The drawback is that if your assembly has artificial repetitions, the primers won't be marked as 'chromosome specific', but as 'chromosome semi-specific '. In a future version this will be addressed.
|
169
|
+
|
170
|
+
### 0.8
|
171
|
+
|
172
|
+
* FEATURE: ```polymarker.rb``` added the flag ```--aligner blast|exonerate ``` which lets you pick between ```blast``` or ```exonerate``` as the aligner. For blast the default is to have the database with the same name as the ```--contigs``` file. However, it is possible to use a different name via the option ```--database```.
|
173
|
+
|
174
|
+
### 0.7.3
|
175
|
+
|
176
|
+
* FEATURE: ```polymarker.rb``` Added to the flag ```--arm_selection``` the option ```scaffold```, which now supports a scaffold specific primer.
|
177
|
+
* FEATURE: ```snp_position_to_polymarker``` Added the option ```--mutant_list``` to prepare files for PolyMarker from files with the following columns ```ID,Allele_1,position,Allele_1,target_chromosome```.
|
178
|
+
|
179
|
+
### 0.7.2
|
180
|
+
|
181
|
+
* FEATURE: Added a flag ```min_identity``` to set the minimum identity to consider a hit. The default is 90
|
182
|
+
|
183
|
+
### 0.7.1
|
184
|
+
* BUGFIX: Now the parser for ```arm_selection_embl``` works with the mixture of contigs and pseudomolecules
|
185
|
+
* DOC: Added documentation on how to use custom references.
|
186
|
+
|
187
|
+
### 0.7.0
|
188
|
+
* Added flag ```genomes_count``` for number of genomes, to be used on tetraploids, etc.
|
189
|
+
|
190
|
+
### 0.6.1
|
191
|
+
|
192
|
+
|
193
|
+
* polymarker.rb now validates that all the files exist.
|
194
|
+
* BUGFIX: A reference was required even when it was not used to generate contigs.
|
195
|
+
|
196
|
+
# Notes
|
197
|
+
|
198
|
+
* BUG: Blocks with NNNs are picked and treated as semi-specific.
|
199
|
+
* BUG: If the name of the reference has a space, the ID is not chopped. ">gene_1 (G12A)" should be treated as ">gene_1".
|
200
|
+
* TODO: Add a parameter file to configure the alignments.
|
201
|
+
* TODO: Produce primers for products of different sizes. This can probably be done with the primer_3_preferences option, but hasn't been tested.
|
202
|
+
|
203
|
+
|
204
|
+
|
205
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
#
|
4
|
+
#require 'bundler/version'
|
5
|
+
|
6
|
+
# Activate the default and development gem groups; on failure report the
# Bundler error and exit with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message, "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Ruby 2.0-2.2 load jeweler, every other version loads juwelier; whichever
# constant was loaded is kept in @taskClass for the task definitions below.
@taskClass =
  if RUBY_VERSION.start_with?("2.0", "2.1", "2.2")
    require 'jeweler'
    Jeweler
  else
    require 'juwelier'
    Juwelier
  end

# Gem packaging tasks.
@taskClass::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name        = "bio-polymarker"
  gem.homepage    = "https://github.com/cb2e6f/bio-polymarker"
  gem.license     = "MIT"
  gem.summary     = "Tool to work with polyploids, NGS and molecular biology"
  gem.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat"
  gem.email       = "rob.ellis@jic.ac.uk"
  gem.authors     = ["Rob Ellis"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #gem.add_runtime_dependency 'bio-samtools', '= 0.6.2'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
  # gem.extensions = "ext/mkrf_conf.rb"
end
@taskClass::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs.push('lib', 'test')
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# The rcov task is only defined when running on Ruby 1.8.
if RUBY_VERSION.start_with?("1.8")
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs.push('test')
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
end

task default: :test
|
61
|
+
|
data/SECURITY.md
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Security Policy
|
2
|
+
|
3
|
+
## Supported Versions
|
4
|
+
|
5
|
+
The following table shows the currently supported version.
|
6
|
+
|
7
|
+
| Version | Supported |
|
8
|
+
| ------- | ------------------ |
|
9
|
+
| 1.1.x | :white_check_mark: |
|
10
|
+
| 1.0.x | :x: |
|
11
|
+
| 0.x.x | :x: |
|
12
|
+
|
13
|
+
|
14
|
+
## Reporting a Vulnerability
|
15
|
+
|
16
|
+
If you find a vulnerability, please submit a comment in the security tab
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.3.2
|
data/bin/bfr.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
#require 'extensions/all'
|
4
|
+
require 'bio-samtools-wrapper'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
8
|
+
$: << File.expand_path('.')
|
9
|
+
path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
10
|
+
$stderr.puts "Loading: #{path}"
|
11
|
+
require path
|
12
|
+
|
13
|
+
# Script body of bfr.rb: parses the command line, runs
# Bio::BFRTools::BFRContainer#process_region over the entries of the reference
# index (all of them, or one chunk), and writes the run statistics to the
# stats file.

options = {
  chunk: 0,
  chunk_size: 0,
  bucket: 1
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: bfr.rb [options]"

  opts.on("-r", "--reference FILE", "Fasta file with the reference sequence. Make sure to run faidx before running bfr in parallel") do |o|
    options[:reference] = o
  end

  opts.on("-a", "--parent_1 FILE", "Sorted BAM file with the alginments from parental 1") do |o|
    options[:parent_1] = o
  end

  opts.on("-b", "--parent_2 FILE", "Sorted BAM file with the alginments from parental 2") do |o|
    options[:parent_2] = o
  end

  opts.on("-c", "--bulk_1 FILE", "Sorted BAM file with the alginments from bulk1 1 (corresponding to the phenotype of parental 1)") do |o|
    options[:bulk_1] = o
  end

  # Registered once; the original declared this same switch a second time further down.
  opts.on("-d", "--bulk_2 FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
    options[:bulk_2] = o
  end

  opts.on("-o", "--bfr FILE", "Output file with the BFRs in the chunck") do |o|
    options[:output_filename] = o
  end

  opts.on("-s", "--stats FILE", "Output with the summary of the run. Only writes at the end, so in principle, paralell process should be able to write on it to get a status of how much has been completed.") do |o|
    options[:stats_file] = o
  end

  # Both chunk switches take integers, so the placeholder reads INT rather than FILE.
  opts.on("-m", "--chunk_size INT", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
    options[:chunk_size] = o.to_i
  end

  opts.on("-n", "--chunk INT", "Chunk number. Must be less than chunk_size. ") do |o|
    options[:chunk] = o.to_i
  end
end
parser.parse!

p options
p ARGV

# These three paths are used directly below; without them the script used to
# die with a TypeError, so stop early with the usage text instead.
required = { reference: "--reference", output_filename: "--bfr", stats_file: "--stats" }
missing = required.reject { |key, _flag| options[key] }.values
abort "Missing required option(s): #{missing.join(', ')}\n#{parser}" unless missing.empty?

reference       = options[:reference]
chunk           = options[:chunk]
chunk_size      = options[:chunk_size]
output_filename = options[:output_filename]
stats_file      = options[:stats_file]

fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_db.load_fai_entries

# chunk_size is used as the number of index entries per chunk: chunk k covers
# entries [k * chunk_size, (k + 1) * chunk_size). A chunk_size of 0 means
# "process every entry".
if chunk_size == 0
  min = 0
  max = fasta_db.index.entries.size
else
  min = chunk * chunk_size
  max = min + chunk_size
end

container = Bio::BFRTools::BFRContainer.new
container.reference reference
container.parental_1({ path: options[:parent_1] })
container.parental_2({ path: options[:parent_2] })
container.bulk_1({ path: options[:bulk_1] })
container.bulk_2({ path: options[:bulk_2] })
container.init_counters

# Block form closes the output file even when a region fails to process.
File.open(output_filename, "w") do |output_file|
  puts "Range: #{min}:#{max}"
  fasta_db.index.entries.each_with_index do |r, i|
    next if i < min || i >= max
    container.process_region({ region: r.get_full_region.to_s, output_file: output_file })
  end
end

# The header is written only when this run creates the stats file; otherwise
# the statistics are appended to the existing file.
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
write_header = !File.exist?(stats_file)
File.open(stats_file, write_header ? "w" : "a") do |stats_io|
  container.print_header({ output_file_stats: stats_io }) if write_header
  container.print_stats({ output_file_stats: stats_io })
end
|
data/bin/blast_triads.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Defaults and command-line parsing for blast_triads.rb.
# Note: Dir.mktmpdir creates the scratch directory immediately, before the
# arguments are parsed.
options = {
  identity: 50,
  min_bases: 200,
  split_token: "-",
  tmp_folder: Dir.mktmpdir,
  program: "blastn",
  random_sample: 0
}

parser = OptionParser.new do |opts|
  # The banner previously named a different script (filter_blat.rb).
  opts.banner = "Usage: blast_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end
end
parser.parse!
|
51
|
+
|
52
|
+
|
53
|
+
# Aligns one FASTA file against another with BLAST and summarises the hit with
# the longest total alignment.
#
# The program is run with XML output (-outfmt 5) written to +out_path+, which
# is then read back through Bio::BlastXMLParser. For each hit the HSP alignment
# lengths and identities are summed (positives too, but only for blastp); the
# hit with the largest summed alignment length provides the returned values.
#
# @param path_a [String] FASTA file passed as -query
# @param path_b [String] FASTA file passed as -subject
# @param out_path [String] file the XML report is written to
# @param program [String] executable to run; also passed as -task
# @return [Array] [alignment length, percent identity, percent positives];
#   [0, 0.0, 0.0] when there is no hit or the program could not be run
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-vector form: no shell is involved, so paths containing spaces or
  # shell metacharacters reach the program unchanged.
  finished = system(program,
                    "-query", path_a,
                    "-subject", path_b,
                    "-task", program,
                    "-out", out_path,
                    "-outfmt", "5")
  unless finished
    # Do not parse out_path after a failed run: it may still hold the report
    # of an earlier alignment written to the same path.
    warn "#{program} failed for #{path_a} vs #{path_b}; reporting no alignment"
    return [0, 0.0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  max_similarity = 0.0

  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iteration|
    iteration.each do |hit|
      align_len = 0
      identity = 0.0
      positives = 0.0
      hit.each do |hsp|
        align_len += hsp.align_len
        identity += hsp.identity
        positives += hsp.positive if program == "blastp"
      end

      # Strictly greater: the first hit wins ties, and zero-length hits never
      # reach the divisions below.
      next unless align_len > max_length

      max_length = align_len
      max_pident = 100 * identity / align_len
      max_similarity = 100 * positives / align_len
    end
  end

  [max_length, max_pident, max_similarity]
end
|
83
|
+
|
84
|
+
# Main body of blast_triads.rb: keeps the longest sequence per gene, then for
# every triad row (optionally a random subset) runs the A->B, A->D and B->D
# alignments and prints one tab-separated line per pair.

split_token = options[:split_token]

# Gene name (text before split_token in the entry id) => longest entry seen.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
#FileUtils.mkdir_p(options[:tmp_folder])
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# Scratch files, overwritten for every triad.
genomes = %w[A B D]
tmp_fasta = {}
genomes.each { |genome| tmp_fasta[genome] = options[:tmp_folder] + "/#{genome}.fa" }
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id" , "query"      , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length"   , "pident"     , "psimilarity" ].join("\t")

# Sampling: each row is picked with probability random_sample / line count
# (the count includes the CSV header line). The probability is capped at 1 so
# that asking for at least as many triads as there are lines runs, and saves,
# every triad; uncapped, a value above 1 selected nothing at all.
sample_size = options[:random_sample]
sampling = sample_size != 0
count_lines = File.foreach(options[:triads]).count
probability = sampling ? [sample_size / count_lines.to_f, 1.0].min : 1.0
prng = Random.new

CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id']

  next unless !sampling || prng.rand < probability

  # Sampled triads get their inputs and BLAST reports copied to random_sample/<group_id>.
  save = sampling

  members = genomes.map do |genome|
    gene = row[genome]
    { genome: genome, gene: gene, seq: sequences[gene], path: tmp_fasta[genome] }
  end

  members.each do |member|
    File.open(member[:path], 'w') { |f| f.write(member[:seq]) } if member[:seq]
  end

  save_folder = "random_sample/#{triad}"
  if save
    FileUtils.mkdir_p save_folder
    members.each { |member| FileUtils.cp(member[:path], save_folder) if member[:seq] }
  end

  # combination(2) yields A/B, A/D, B/D in that order.
  members.combination(2).each do |query, subject|
    next unless query[:seq] && subject[:seq]

    stats = blast_pair_fast(query[:path], subject[:path], out_tmp, program: options[:program])
    pair_name = "#{query[:genome]}_#{subject[:genome]}"
    FileUtils.cp(out_tmp, "#{save_folder}/#{pair_name}.xml") if save
    puts [
      triad, query[:gene], subject[:gene],
      query[:genome], subject[:genome], "#{query[:genome]}->#{subject[:genome]}",
      *stats
    ].join("\t")
  end
end
|