bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 89d5e8e03f5d0ad769937950884f4c02591e445a9135817e20feee20489dda24
4
+ data.tar.gz: 34fa2901aa11717b33be10a5e1405a159c93375e14d1a1783b7d3da7d2d6e966
5
+ SHA512:
6
+ metadata.gz: 0c04602ec3dd28a1bca16a5d30c3292fab2d069304c08111f3749dac1666b7a22291ad7eefc94ecf0f5e1510c720c9c9a9955e484177927bc7506b7d9e3af95a
7
+ data.tar.gz: 2ec66e342aeb13ba371b7e22da7fcfabb7d690903ddd13de732177d6489a3ad1732f496ae169dc71b7baf76f09a07a64d501ab50672b73194d58199c861b33d4
data/.travis.yml ADDED
@@ -0,0 +1,24 @@
1
+ language: ruby
2
+ sudo: false
3
+ addons:
4
+ apt:
5
+ packages:
6
+ - zlib1g-dev
7
+ - libncurses5-dev
8
+ - libtinfo-dev
9
+ - exonerate
10
+ - blast2
11
+ - ncbi-blast+
12
+ - mafft
13
+ - primer3
14
+ before_install:
15
+ - gem update --system
16
+ - export RUBYOPT="-W1"
17
+ rvm:
18
+ - 2.3
19
+ - 2.4
20
+ - 2.5
21
+ - 2.6
22
+ - 2.7
23
+
24
+
data/Gemfile ADDED
@@ -0,0 +1,23 @@
# Gem dependencies for bio-polymarker, resolved by Bundler.
# The source is fetched over HTTPS so gem downloads are not sent in clear text.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
# gem "activesupport", ">= 2.3.5"

gem "bio", ">= 1.5.1"
gem "bio-samtools-wrapper", ">= 2.7.0"
gem "descriptive_statistics"
#gem "rake"

gem "sorted_set"

gem "systemu", ">=2.5.2"

group :development do
  gem "shoulda", ">= 2.10"
  gem 'test-unit'
  # Ruby 2.0, 2.1 and 2.2 get the pinned jeweler release; every other
  # version uses juwelier.
  if RUBY_VERSION.start_with?("2.0", "2.1", "2.2")
    gem "jeweler", "= 2.0.1"
  else
    gem "juwelier"
  end
end
data/README.md ADDED
@@ -0,0 +1,205 @@
1
+ # bio-polyploid-tools
2
+
3
+ ## Introduction
4
+
5
+ These tools are designed to deal with polyploid wheat. The first tool designs KASP primers, making them as specific as possible.
6
+
7
+
8
+ ## Installation
9
+
10
+ ```sh
11
+ gem install bio-polyploid-tools
12
+ ```
13
+ You need to have in your ```$PATH``` the following programs:
14
+
15
+ * [MAFFT](http://mafft.cbrc.jp/alignment/software/)
16
+ * [primer3](http://primer3.sourceforge.net/releases.php)
17
+ * [exonerate](http://www.ebi.ac.uk/~guy/exonerate/)
18
+ * [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE%3DBlastDocs&DOC_TYPE%3DDownload)
19
+
20
+ The code was originally developed on ruby 2.1, 2.3 and 2.5. It may work on older versions. However, it is only actively tested in currently supported ruby versions:
21
+
22
+ * 2.1.10
23
+ * 2.2.5
24
+ * 2.3.5
25
+ * 2.4.2
26
+ * 2.5.0
27
+
28
+ # PolyMarker
29
+
30
+ To run PolyMarker with the CSS wheat contigs, you need to unzip the reference file from [ensembl](http://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz).
31
+
32
+
33
+ ```sh
34
+ polymarker.rb --contigs Triticum_aestivum.IWGSC2.25.dna.genome.fa --marker_list snp_list.csv --output output_folder
35
+ ```
36
+
37
+ The ```snp_list``` file must follow the convention ```ID,Chromosome,SEQUENCE``` with the SNP inside the sequence in the format [A/T]. As a reference, look at test/data/short_primer_design_test.csv
38
+
39
+ If you want to use the web interface, visit the [PolyMarker webservice at TGAC](http://polymarker.tgac.ac.uk)
40
+
41
+ The available command line arguments are:
42
+
43
+ ```
44
+ Usage: polymarker.rb [options]
45
+ -c, --contigs FILE File with contigs to use as database
46
+ -m, --marker_list FILE File with the list of markers to search from
47
+ -g, --genomes_count INT Number of genomes (default 3, for hexaploid)
48
+ -s, --snp_list FILE File with the list of snps to search from, requires --reference to get the sequence using a position
49
+ -t, --mutant_list FILE File with the list of positions with mutation and the mutation line.
50
+ requires --reference to get the sequence using a position
51
+ -r, --reference FILE Fasta file with the sequence for the markers (to complement --snp_list)
52
+ -o, --output FOLDER Output folder
53
+ -e, --exonerate_model MODEL Model to be used in exonerate to search for the contigs
54
+ -i, --min_identity INT Minimum identity to consider a hit (default 90)
55
+ -a, --arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two
56
+ Function to decide the chromome arm
57
+ -p, --primer_3_preferences FILE file with preferences to be sent to primer3
58
+ -v, --variation_free_region INT If present, avoid generating the common primer if there are homoeologous SNPs within the specified distance (not tested)
59
+ -x, --extract_found_contigs If present, save in a separate file the contigs with matches. Useful to debug.
60
+ -P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
61
+ ```
62
+
63
+ ## Input formats
64
+
65
+ The following formats are used to define the marker sequences:
66
+
67
+ ### Marker list
68
+
69
+ If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence are included in the file. The format contains 3 columns (the order is important):
70
+
71
+ * **snp_name** The ID of the marker. Must be unique.
72
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
73
+ * **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
74
+
75
+ #### Example:
76
+
77
+ ```
78
+ BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
79
+ ```
80
+
81
+ ### SNP list
82
+
83
+ If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
84
+
85
+ * **scaffold** The scaffold where the SNP is.
86
+ * **reference allele** The base in the reference (may or may not be the same as in the reference file).
87
+ * **position** Position of the SNP. The first base in the scaffold is base 1.
88
+ * **alternative allele** The base in the alternative allele.
89
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
90
+
91
+ #### Example
92
+
93
+ ```
94
+ IWGSC_CSS_1AL_scaff_110,C,519,A,2A
95
+ ```
96
+
97
+ This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
98
+
99
+
100
+ ### Custom reference sequences.
101
+
102
+ By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
103
+ ) are used. However, it is possible to use a custom reference. To define the chromosome where each contig belongs, the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores, is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters of each contig name are used as the identifier, useful when pseudomolecules are named after the chromosomes (ie: ">1A" in the fasta file).
104
+
105
+ If your contigs follow a different convention, in the file ```ChromosomeArm.rb``` it is possible to define new parsers, by adding at the beginning, with the rest of the parsers, a new lambda like:
106
+
107
+ ```rb
108
+ @@arm_selection_functions[:embl] = lambda do | contig_name|
109
+ arr = contig_name.split('_')
110
+ ret = "U"
111
+ ret = arr[2][0,2] if arr.size >= 3
112
+ ret = "3B" if arr.size == 2 and arr[0] == "v443"
113
+ ret = arr[0][0,2] if arr.size == 1
114
+ return ret
115
+ end
116
+ ```
117
+
118
+ The function should return a 2-character string, where the first character is the chromosome number and the second the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
119
+
120
+ ## Using blast
121
+
122
+ To use blast instead of exonerate, use the following command:
123
+
124
+ ```
125
+ ./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.txt --aligner blast -a arm_selection_first_two
126
+ ```
127
+
128
+
129
+ ## Release Notes
130
+
131
+ ### 0.9.7
132
+ There was some strange issue with the numbering, so bumped to 0.9.7
133
+
134
+ * Moved the arm selection function for fields in the chromosome name to the ```ChromosomeArm``` class.
135
+
136
+ ### 0.8.7
137
+ * FEATURE: ```polymarker.rb``` now also prints the total number of hits found.
138
+
139
+ ### 0.8.6
140
+
141
+ * BUGFIX: ```primer3.rb``` had a regression when adding the repetitive flag to the ```@values``` array. This led to the wrong order of the columns in the output and possibly other secondary effects.
142
+
143
+ ### 0.8.5
144
+
145
+ * Added the option ```--max_hits``` to ```polymarker.rb``` to set a maximum number of blast hits to identify repetitive regions. This adds the column ```is_repetitve``` to the output. The mask is not calculated in repetitive regions and the primers are designed as non-specific.
146
+
147
+ ### 0.8.4
148
+
149
+ * Added script ```tag_stats.rb``` that gets the descriptive statistics for a tag in a bam file for each reference.
150
+
151
+ ```bash
152
+ ruby tag_stats.rb -b HI.3206.006.Index_2.CS_125RNA_14d_Leaf8.sorted.bam -r /Users/ramirezr/Dropbox/JIC/expVIPMetadatas/RefSeq1.0/Genes/annotation/IWGSCv1.0_UTR_ALL.cdnas.fasta --tag 'NH'
153
+ ```
154
+
155
+ ### 0.8.3
156
+
157
+ * BUGFIX: ```ChromosomeArm.rb``` was fixed to conform the module assumptions for the package.
158
+
159
+
160
+ ### 0.8.2
161
+
162
+ * FEATURE: The functions to select the chromosome arm are now in ```lib/bio/PolyploidTools/ChromosomeArm.rb``` and the help message is updated automatically with the valid options.
163
+ * FEATURE: Added option ```filter_best``` to replicate the original behaviour of selecting the best hit of each chromosome. Still useful for assemblies which still contain synthetic duplications.
164
+
165
+ ### 0.8.1
166
+
167
+ * BUGFIX: There was an error which prevented the correct localisation of the SNP in markers with gaps in the local alignment before the position with the snp.
168
+ * FEATURE: PolyMarker now selects the best hit of the target chromosome. This improves the specificity in regions with a recent duplication. The drawback is that if your assembly has artificial repetitions, the primers won't be marked as 'chromosome specific', but as 'chromosome semi-specific '. In a future version this will be addressed.
169
+
170
+ ### 0.8
171
+
172
+ * FEATURE: ```polymarker.rb``` added the flag ```--aligner blast|exonerate ``` which lets you pick between ```blast``` or ```exonerate``` as the aligner. For blast the default is to have the database with the same name as the ```--contigs``` file. However, it is possible to use a different name via the option ```--database```.
173
+
174
+ ### 0.7.3
175
+
176
+ * FEATURE: ```polymarker.rb``` Added to the flag ```--arm_selection``` the option ```scaffold```, which now supports a scaffold specific primer.
177
+ * FEATURE: ```snp_position_to_polymarker``` Added the option ```--mutant_list``` to prepare files for PolyMarker from files with the following columns ```ID,Allele_1,position,Allele_1,target_chromosome```.
178
+
179
+ ### 0.7.2
180
+
181
+ * FEATURE: Added a flag ```min_identity``` to set the minimum identity to consider a hit. The default is 90
182
+
183
+ ### 0.7.1
184
+ * BUGFIX: Now the parser for ```arm_selection_embl``` works with the mixture of contigs and pseudomolecules
185
+ * DOC: Added documentation on how to use custom references.
186
+
187
+ ### 0.7.0
188
+ * Added flag ```genomes_count``` for number of genomes, to be used on tetraploids, etc.
189
+
190
+ ### 0.6.1
191
+
192
+
193
+ * polymarker.rb now validates that all the files exist.
194
+ * BUGFIX: A reference was required even when it was not used to generate contigs.
195
+
196
+ # Notes
197
+
198
+ * BUG: Blocks with NNNs are picked and treated as semi-specific.
199
+ * BUG: If the name of the reference has a space, the ID is not chopped. ">gene_1 (G12A)" should be treated as ">gene_1".
200
+ * TODO: Add a parameter file to configure the alignments.
201
+ * TODO: Produce primers for products of different sizes. This can probably be done with the primer_3_preferences option, but hasn't been tested.
202
+
203
+
204
+
205
+
data/Rakefile ADDED
@@ -0,0 +1,61 @@
require 'rubygems'
require 'bundler'
#
#require 'bundler/version'

# Activate the :default and :development gem groups before loading rake.
# When Bundler cannot satisfy them, print its message plus a hint and exit
# with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end
require 'rake'

# Ruby 2.0, 2.1 and 2.2 load jeweler; every other version loads juwelier.
# The chosen module is kept in @taskClass for the task definitions below.
if RUBY_VERSION.start_with?("2.1", "2.2", "2.0")
  require 'jeweler'
  @taskClass = Jeweler
else
  require 'juwelier'
  @taskClass = Juwelier
end

# Gem packaging tasks and the metadata written into the generated gemspec.
@taskClass::Tasks.new do |spec|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  spec.name = "bio-polymarker"
  spec.homepage = "https://github.com/cb2e6f/bio-polymarker"
  spec.license = "MIT"
  spec.summary = %Q{Tool to work with polyploids, NGS and molecular biology}
  spec.description = %Q{Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat}
  spec.email = "rob.ellis@jic.ac.uk"
  spec.authors = ["Rob Ellis"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #gem.add_runtime_dependency 'bio-samtools', '= 0.6.2'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
  # gem.extensions = "ext/mkrf_conf.rb"
end
@taskClass::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# The rcov coverage task is only defined when running on Ruby 1.8.
if RUBY_VERSION.start_with?("1.8")
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs.push('test')
    t.pattern = 'test/**/test_*.rb'
    t.verbose = true
  end
end

task default: :test
61
+
data/SECURITY.md ADDED
@@ -0,0 +1,16 @@
1
+ # Security Policy
2
+
3
+ ## Supported Versions
4
+
5
+ The following table shows the currently supported version.
6
+
7
+ | Version | Supported |
8
+ | ------- | ------------------ |
9
+ | 1.1.x | :white_check_mark: |
10
+ | 1.0.x | :x: |
11
+ | 0.x.x | :x: |
12
+
13
+
14
+ ## Reporting a Vulnerability
15
+
16
+ If you find a vulnerability, please submit a comment in the security tab
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.3.2
data/bin/bfr.rb ADDED
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ #require 'extensions/all'
4
+ require 'bio-samtools-wrapper'
5
+ require 'optparse'
6
+
7
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
8
+ $: << File.expand_path('.')
9
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
10
+ $stderr.puts "Loading: #{path}"
11
+ require path
12
+
13
+ options = {}
14
+
15
+ options[:chunk] = 0
16
+ options[:chunk_size] = 0
17
+ options[:bucket] = 1
18
+
19
+ OptionParser.new do |opts|
20
+ opts.banner = "Usage: bfr.rb [options]"
21
+
22
+ opts.on("-r", "--reference FILE", "Fasta file with the reference sequence. Make sure to run faidx before running bfr in parallel") do |o|
23
+ options[:reference] = o
24
+ end
25
+
26
+ opts.on("-a", "--parent_1 FILE", "Sorted BAM file with the alginments from parental 1") do |o|
27
+ options[:parent_1] = o
28
+ end
29
+
30
+ opts.on("-b", "--parent_2 FILE", "Sorted BAM file with the alginments from parental 2") do |o|
31
+ options[:parent_2] = o
32
+ end
33
+
34
+ opts.on("-c", "--bulk_1 FILE", "Sorted BAM file with the alginments from bulk1 1 (corresponding to the phenotype of parental 1)") do |o|
35
+ options[:bulk_1] = o
36
+ end
37
+
38
+ opts.on("-d", "--bulk_2 FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
39
+ options[:bulk_2] = o
40
+ end
41
+
42
+ opts.on("-o", "--bfr FILE", "Output file with the BFRs in the chunck") do |o|
43
+ options[:output_filename] = o
44
+ end
45
+
46
+ opts.on("-s", "--stats FILE", "Output with the summary of the run. Only writes at the end, so in principle, paralell process should be able to write on it to get a status of how much has been completed.") do |o|
47
+ options[:stats_file] = o
48
+ end
49
+ opts.on("-d", "--bulk_2 FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
50
+ options[:bulk_2] = o
51
+ end
52
+
53
+ opts.on("-m", "--chunk_size FILE", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
54
+ options[:chunk_size] = o.to_i
55
+ end
56
+
57
+ opts.on("-n", "--chunk FILE", "Chunk number. Must be less than chunk_size. ") do |o|
58
+ options[:chunk] = o.to_i
59
+ end
60
+
61
+
62
+ end.parse!
63
+
64
+ p options
65
+ p ARGV
66
+
67
+
68
+ reference = options[:reference]
69
+ chunk = options[:chunk]
70
+ chunk_size = options[:chunk_size]
71
+ output_filename = options[:output_filename]
72
+ stats_file = options[:stats_file]
73
+
74
+
75
+ min = chunk * chunk_size
76
+ max = min + chunk_size
77
+
78
+
79
+ parental_1=options[:parent_1]
80
+ parental_2=options[:parent_2]
81
+
82
+
83
+ bulk_1 = options[:bulk_1]
84
+ bulk_2 = options[:bulk_2]
85
+
86
+
87
+ fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
88
+ fasta_db.load_fai_entries
89
+
90
+
91
+ if chunk_size == 0
92
+ min = 0
93
+ max = fasta_db.index.entries.size
94
+ end
95
+
96
+ container = Bio::BFRTools::BFRContainer.new
97
+
98
+ container.reference reference
99
+ container.parental_1 ( {:path => parental_1 } )
100
+ container.parental_2 ( {:path => parental_2 } )
101
+ container.bulk_1 ( {:path => bulk_1 })
102
+ container.bulk_2 ( {:path => bulk_2 })
103
+
104
+ i = -1
105
+
106
+ container.init_counters
107
+ output_file = File.open(output_filename, "w")
108
+ puts "Range: #{min}:#{max}"
109
+ fasta_db.index.entries.each do | r |
110
+ i = i + 1
111
+ #puts r
112
+ #puts i
113
+ next if i < min or i >= max
114
+ container.process_region({:region => r.get_full_region.to_s,:output_file => output_file } )
115
+ #puts "Processed"
116
+ end
117
+ output_file.close
118
+
119
+ file_h = nil
120
+ if !File.exists? stats_file
121
+ file_h = File.open(stats_file, "w")
122
+ container.print_header({:output_file_stats => file_h})
123
+ else
124
+ file_h = File.open(stats_file, "a")
125
+ end
126
+ container.print_stats({:output_file_stats => file_h})
127
+
128
+ file_h.close
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
# Defaults for the command-line options. The temporary folder is created
# eagerly with Dir.mktmpdir and is used for the per-triad FASTA and BLAST files.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

option_parser = OptionParser.new do |opts|

  # The banner names this script; it previously advertised "filter_blat.rb".
  opts.banner = "Usage: blast_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

end
# Consumes the recognised switches from ARGV and fills in `options`.
option_parser.parse!
51
+
52
+
# Runs a pairwise BLAST of one FASTA file against another and summarises the
# hit whose HSPs add up to the longest alignment.
#
# @param path_a [String] FASTA file passed to BLAST as -query
# @param path_b [String] FASTA file passed to BLAST as -subject
# @param out_path [String] file where BLAST writes its XML report (-outfmt 5)
# @param program [String] executable to run; it is also passed as -task.
#   Positives are only accumulated when this is "blastp".
# @return [Array] [max_length, max_pident, max_similarity] for the hit with
#   the largest summed HSP alignment length; [0, 0.0, 0.0] when there is no
#   hit or the BLAST run did not succeed.
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-list form of system: no shell is spawned, so paths containing
  # spaces or shell metacharacters reach the program unchanged.
  executed = system(program,
                    "-query", path_a,
                    "-subject", path_b,
                    "-task", program,
                    "-out", out_path,
                    "-outfmt", "5")
  unless executed
    # Do not parse out_path here: it may still hold the report of an earlier pair.
    warn "#{program} failed for #{path_a} vs #{path_b}"
    return [0, 0.0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  max_similarity = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      align_len = 0
      identity = 0.0
      positives = 0.0
      # Sum over every HSP of the hit.
      hit.each do |hsp|
        align_len += hsp.align_len
        identity += hsp.identity
        positives += hsp.positive if program == "blastp"
      end
      # Keep the percentages of the hit with the longest total alignment.
      if align_len > max_length
        max_length = align_len
        max_pident = 100 * identity / align_len
        max_similarity = 100 * positives / align_len
      end
    end
  end
  [max_length, max_pident, max_similarity]
end
83
+
split_token = options[:split_token]

# Index the FASTA entries by gene name (the part of the entry id before
# split_token), keeping only the longest sequence seen for each gene.
sequences = Hash.new
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    known = sequences[gene_name]
    sequences[gene_name] = entry if known.nil? || entry.length > known.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
#FileUtils.mkdir_p(options[:tmp_folder])
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# One scratch FASTA per genome plus one BLAST report, all reused for every triad.
genomes = ["A", "B", "D"]
genome_pairs = [["A", "B"], ["A", "D"], ["B", "D"]]
tmp_paths = {}
genomes.each { |genome| tmp_paths[genome] = options[:tmp_folder] + "/#{genome}.fa" }
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id" , "query" , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length" , "pident" , "psimilarity" ].join("\t")

# NOTE(review): this counts every line of the file, header row included.
count_lines = File.foreach(options[:triads]).count

# Chance of running (and keeping) each triad. A sample size of 0 means "run
# everything". A sample size above the number of lines is capped at 1;
# uncapped, it exceeded 1, matched neither branch below and nothing ran.
probability = options[:random_sample] / count_lines.to_f
probability = 1 if options[:random_sample] == 0 || probability > 1
prng = Random.new

CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id']

  # Files are only saved when sampling; a full run just prints the table.
  save = probability > prng.rand && probability < 1
  next unless probability == 1 || save

  gene_ids = {}
  seqs = {}
  genomes.each do |genome|
    gene_ids[genome] = row[genome]
    seqs[genome] = sequences[gene_ids[genome]]
  end

  genomes.each do |genome|
    File.open(tmp_paths[genome], 'w') { |f| f.write(seqs[genome]) } if seqs[genome]
  end
  save_folder = "random_sample/#{triad}"

  if save
    FileUtils.mkdir_p save_folder
    genomes.each do |genome|
      FileUtils.cp(tmp_paths[genome], save_folder) if seqs[genome]
    end
  end

  # Compare each genome pair whose two sequences are both present. The
  # labels come from the pair itself, so the columns always match the files.
  genome_pairs.each do |query, subject|
    next unless seqs[query] && seqs[subject]
    to_print = [triad, gene_ids[query], gene_ids[subject], query, subject, "#{query}->#{subject}"]
    to_print << blast_pair_fast(tmp_paths[query], tmp_paths[subject], out_tmp, program: options[:program])
    FileUtils.cp(out_tmp, "#{save_folder}/#{query}_#{subject}.xml") if save
    puts to_print.join("\t")
  end
end