miga-base 0.2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
data/utils/adapters.fa ADDED
@@ -0,0 +1,302 @@
1
+ >Illumina_Single_End_Apapter_1
2
+ ACACTCTTTCCCTACACGACGCTGTTCCATCT
3
+ >Illumina_Single_End_Apapter_2
4
+ CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
5
+ >Illumina_Single_End_PCR_Primer_1
6
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
7
+ >Illumina_Single_End_PCR_Primer_2
8
+ CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
9
+ >Illumina_Single_End_Sequencing_Primer
10
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
11
+
12
+ >Illumina_Paired_End_Adapter_1
13
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
14
+ >Illumina_Paired_End_Adapter_2
15
+ CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
16
+ >Illumina_Paried_End_PCR_Primer_1
17
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
18
+ >Illumina_Paired_End_PCR_Primer_2
19
+ CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
20
+ >Illumina_Paried_End_Sequencing_Primer_1
21
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
22
+ >Illumina_Paired_End_Sequencing_Primer_2
23
+ CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT
24
+
25
+ >Illumina_DpnII_expression_Adapter_1
26
+ ACAGGTTCAGAGTTCTACAGTCCGAC
27
+ >Illumina_DpnII_expression_Adapter_2
28
+ CAAGCAGAAGACGGCATACGA
29
+ >Illumina_DpnII_expression_PCR_Primer_1
30
+ CAAGCAGAAGACGGCATACGA
31
+ >Illumina_DpnII_expression_PCR_Primer_2
32
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
33
+ >Illumina_DpnII_expression_Sequencing_Primer
34
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
35
+
36
+ >Illumina_NlaIII_expression_Adapter_1
37
+ ACAGGTTCAGAGTTCTACAGTCCGACATG
38
+ >Illumina_NlaIII_expression_Adapter_2
39
+ CAAGCAGAAGACGGCATACGA
40
+ >Illumina_NlaIII_expression_PCR_Primer_1
41
+ CAAGCAGAAGACGGCATACGA
42
+ >Illumina_NlaIII_expression_PCR_Primer_2
43
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
44
+ >Illumina_NlaIII_expression_Sequencing_Primer
45
+ CCGACAGGTTCAGAGTTCTACAGTCCGACATG
46
+
47
+ >Illumina_Small_RNA_Adapter_1
48
+ GTTCAGAGTTCTACAGTCCGACGATC
49
+ >Illumina_Small_RNA_Adapter_2
50
+ TCGTATGCCGTCTTCTGCTTGT
51
+ >Illumina_Small_RNA_RT_Primer
52
+ CAAGCAGAAGACGGCATACGA
53
+ >Illumina_Small_RNA_PCR_Primer_1
54
+ CAAGCAGAAGACGGCATACGA
55
+ >Illumina_Small_RNA_PCR_Primer_2
56
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
57
+ >Illumina_Small_RNA_Sequencing_Primer
58
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
59
+
60
+ >Illumina_Multiplexing_Adapter_1
61
+ GATCGGAAGAGCACACGTCT
62
+ >Illumina_Multiplexing_Adapter_2
63
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
64
+ >Illumina_Multiplexing_PCR_Primer_1.01
65
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
66
+ >Illumina_Multiplexing_PCR_Primer_2.01
67
+ GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
68
+ >Illumina_Multiplexing_Read1_Sequencing_Primer
69
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
70
+ >Illumina_Multiplexing_Index_Sequencing_Primer
71
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
72
+ >Illumina_Multiplexing_Read2_Sequencing_Primer
73
+ GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
74
+
75
+ >Illumina_PCR_Primer_Index_1
76
+ CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC
77
+ >Illumina_PCR_Primer_Index_2
78
+ CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC
79
+ >Illumina_PCR_Primer_Index_3
80
+ CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC
81
+ >Illumina_PCR_Primer_Index_4
82
+ CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC
83
+ >Illumina_PCR_Primer_Index_5
84
+ CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC
85
+ >Illumina_PCR_Primer_Index_6
86
+ CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC
87
+ >Illumina_PCR_Primer_Index_7
88
+ CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC
89
+ >Illumina_PCR_Primer_Index_8
90
+ CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC
91
+ >Illumina_PCR_Primer_Index_9
92
+ CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC
93
+ >Illumina_PCR_Primer_Index_10
94
+ CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC
95
+ >Illumina_PCR_Primer_Index_11
96
+ CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC
97
+ >Illumina_PCR_Primer_Index_12
98
+ CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC
99
+
100
+ >Illumina_DpnII_Gex_Adapter_1
101
+ GATCGTCGGACTGTAGAACTCTGAAC
102
+ >Illumina_DpnII_Gex_Adapter_1.01
103
+ ACAGGTTCAGAGTTCTACAGTCCGAC
104
+ >Illumina_DpnII_Gex_Adapter_2
105
+ CAAGCAGAAGACGGCATACGA
106
+ >Illumina_DpnII_Gex_Adapter_2.01
107
+ TCGTATGCCGTCTTCTGCTTG
108
+ >Illumina_DpnII_Gex_PCR_Primer_1
109
+ CAAGCAGAAGACGGCATACGA
110
+ >Illumina_DpnII_Gex_PCR_Primer_2
111
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
112
+ >Illumina_DpnII_Gex_Sequencing_Primer
113
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
114
+
115
+ >Illumina_NlaIII_Gex_Adapter_1.01
116
+ TCGGACTGTAGAACTCTGAAC
117
+ >Illumina_NlaIII_Gex_Adapter_1.02
118
+ ACAGGTTCAGAGTTCTACAGTCCGACATG
119
+ >Illumina_NlaIII_Gex_Adapter_2.01
120
+ CAAGCAGAAGACGGCATACGA
121
+ >Illumina_NlaIII_Gex_Adapter_2.02
122
+ TCGTATGCCGTCTTCTGCTTG
123
+ >Illumina_NlaIII_Gex_PCR_Primer_1
124
+ CAAGCAGAAGACGGCATACGA
125
+ >Illumina_NlaIII_Gex_PCR_Primer_2
126
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
127
+ >Illumina_NlaIII_Gex_Sequencing_Primer
128
+ CCGACAGGTTCAGAGTTCTACAGTCCGACATG
129
+
130
+ >Illumina_Small_RNA_RT_Primer
131
+ CAAGCAGAAGACGGCATACGA
132
+ >Illumina_5p_RNA_Adapter
133
+ GTTCAGAGTTCTACAGTCCGACGATC
134
+ >Illumina_RNA_Adapter1
135
+ TCGTATGCCGTCTTCTGCTTGT
136
+
137
+ >Illumina_Small_RNA_3p_Adapter_1
138
+ ATCTCGTATGCCGTCTTCTGCTTG
139
+ >Illumina_Small_RNA_PCR_Primer_1
140
+ CAAGCAGAAGACGGCATACGA
141
+ >Illumina_Small_RNA_PCR_Primer_2
142
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
143
+ >Illumina_Small_RNA_Sequencing_Primer
144
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
145
+
146
+ >TruSeq_Universal_Adapter
147
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
148
+ >TruSeq_Adapter_Index_1
149
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
150
+ >TruSeq_Adapter_Index_2
151
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
152
+ >TruSeq_Adapter_Index_3
153
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
154
+ >TruSeq_Adapter_Index_4
155
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
156
+ >TruSeq_Adapter_Index_5
157
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
158
+ >TruSeq_Adapter_Index_6
159
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
160
+ >TruSeq_Adapter_Index_7
161
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
162
+ >TruSeq_Adapter_Index_8
163
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
164
+ >TruSeq_Adapter_Index_9
165
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
166
+ >TruSeq_Adapter_Index_10
167
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
168
+ >TruSeq_Adapter_Index_11
169
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
170
+ >TruSeq_Adapter_Index_12
171
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
172
+
173
+ >Illumina_RNA_RT_Primer
174
+ GCCTTGGCACCCGAGAATTCCA
175
+ >Illumina_RNA_PCR_Primer
176
+ AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA
177
+
178
+ >RNA_PCR_Primer_Index_1
179
+ CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
180
+ >RNA_PCR_Primer_Index_2
181
+ CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
182
+ >RNA_PCR_Primer_Index_3
183
+ CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
184
+ >RNA_PCR_Primer_Index_4
185
+ CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
186
+ >RNA_PCR_Primer_Index_5
187
+ CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
188
+ >RNA_PCR_Primer_Index_6
189
+ CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
190
+ >RNA_PCR_Primer_Index_7
191
+ CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
192
+ >RNA_PCR_Primer_Index_8
193
+ CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
194
+ >RNA_PCR_Primer_Index_9
195
+ CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
196
+ >RNA_PCR_Primer_Index_10
197
+ CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
198
+ >RNA_PCR_Primer_Index_11
199
+ CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
200
+ >RNA_PCR_Primer_Index_12
201
+ CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
202
+ >RNA_PCR_Primer_Index_13
203
+ CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
204
+ >RNA_PCR_Primer_Index_14
205
+ CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
206
+ >RNA_PCR_Primer_Index_15
207
+ CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
208
+ >RNA_PCR_Primer_Index_16
209
+ CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
210
+ >RNA_PCR_Primer_Index_17
211
+ CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
212
+ >RNA_PCR_Primer_Index_18
213
+ CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
214
+ >RNA_PCR_Primer_Index_19
215
+ CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
216
+ >RNA_PCR_Primer_Index_20
217
+ CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
218
+ >RNA_PCR_Primer_Index_21
219
+ CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
220
+ >RNA_PCR_Primer_Index_22
221
+ CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
222
+ >RNA_PCR_Primer_Index_23
223
+ CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
224
+ >RNA_PCR_Primer_Index_24
225
+ CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
226
+ >RNA_PCR_Primer_Index_25
227
+ CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
228
+ >RNA_PCR_Primer_Index_26
229
+ CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
230
+ >RNA_PCR_Primer_Index_27
231
+ CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
232
+ >RNA_PCR_Primer_Index_28
233
+ CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
234
+ >RNA_PCR_Primer_Index_29
235
+ CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
236
+ >RNA_PCR_Primer_Index_30
237
+ CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
238
+ >RNA_PCR_Primer_Index_31
239
+ CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
240
+ >RNA_PCR_Primer_Index_32
241
+ CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
242
+ >RNA_PCR_Primer_Index_33
243
+ CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
244
+ >RNA_PCR_Primer_Index_34
245
+ CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
246
+ >RNA_PCR_Primer_Index_35
247
+ CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
248
+ >RNA_PCR_Primer_Index_36
249
+ CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
250
+ >RNA_PCR_Primer_Index_37
251
+ CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
252
+ >RNA_PCR_Primer_Index_38
253
+ CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
254
+ >RNA_PCR_Primer_Index_39
255
+ CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
256
+ >RNA_PCR_Primer_Index_40
257
+ CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
258
+ >RNA_PCR_Primer_Index_41
259
+ CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
260
+ >RNA_PCR_Primer_Index_42
261
+ CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
262
+ >RNA_PCR_Primer_Index_43
263
+ CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
264
+ >RNA_PCR_Primer_Index_44
265
+ CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
266
+ >RNA_PCR_Primer_Index_45
267
+ CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
268
+ >RNA_PCR_Primer_Index_46
269
+ CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
270
+ >RNA_PCR_Primer_Index_47
271
+ CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
272
+ >RNA_PCR_Primer_Index_48
273
+ CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
274
+
275
+ >ABI_Dynabead_EcoP_Oligo
276
+ CTGATCTAGAGGTACCGGATCCCAGCAGT
277
+ >ABI_Solid3_Adapter_A
278
+ CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG
279
+ >ABI_Solid3_Adapter_B
280
+ CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT
281
+ >ABI_Solid3_5_AMP_Primer
282
+ CCACTACGCCTCCGCTTTCCTCTCTATG
283
+ >ABI_Solid3_3_AMP_Primer
284
+ CTGCCCCGGGTTCCTCATTCT
285
+ >ABI_Solid3_EF1_alpha_Sense_Primer
286
+ CATGTGTGTTGAGAGCTTC
287
+ >ABI_Solid3_EF1_alpha_Antisense_Primer
288
+ GAAAACCAAAGTGGTCCAC
289
+ >ABI_Solid3_GAPDH_Forward_Primer
290
+ TTAGCACCCCTGGCCAAGG
291
+ >ABI_Solid3_GAPDH_Reverse_Primer
292
+ CTTACTCCTTGGAGGCCATG
293
+ >TruSeq2_SE
294
+ AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG
295
+ >TruSeq2_PE_f
296
+ AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
297
+ >TruSeq2_PE_r
298
+ AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
299
+ >TruSeq3_IndexedAdapter
300
+ AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
301
+ >TruSeq3_UniversalAdapter
302
+ AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA
data/utils/mytaxa_scan.R ADDED
@@ -0,0 +1,89 @@
1
+
2
# Plot a MyTaxa scan and report the windows with an atypical taxonomic profile.
#
# Arguments:
#   wintax  Path to a tab-delimited table without a header row. Column 1 holds
#           the taxon labels (used as row names), column 2 the genome-wide
#           frequencies, and each further column the frequencies of one window.
#           Empty cells are read as NA; lines starting with '#' are skipped by
#           read.table's default comment handling.
#   col     Base colours for the taxa. Rows cycle through all but the last
#           colour, and the whole palette is darkened after each full cycle.
#   main    Title drawn in the top panel.
#
# Side effects:
#   Draws a six-panel figure on the current graphics device (layout() and par()
#   are left modified) and, when outlier windows exist, writes their indices to
#   `<wintax>.regions`.
#
# Value:
#   Integer indices of the windows whose distance to the genome profile exceeds
#   the 95th percentile of all window distances, or NULL when there are none or
#   when fewer than two windows are available.
mytaxa.scan <- function(
    wintax,
    col=c('#4dbeee','#7e2f8e','#0072bd','#d95319',
          '#edb120','#77ac30','#a2142f'),
    main='MyTaxa scan'){
  a <- read.table(wintax, sep='\t', header=FALSE, row.names=1, na.strings='',
                  quote='')
  # Guarantee an "NA" (unclassified) row so it can always be dropped from b
  if(! "NA" %in% rownames(a)) a["NA", ] <- 0
  b <- as.matrix(a[-which(rownames(a)=="NA"),-1])
  if(ncol(b) <= 1){
    plot(1, type='n', bty='n', axes=FALSE)
    legend('center', legend='Insufficient data')
    return(c())
  }

  layout(matrix(c(6,6,1,4,2,3,5,5), byrow=TRUE, ncol=2),
         widths=c(7,1), heights=c(1/4,1,2,3))

  #::: DISTANCES
  par(mar=c(1,5,2,0)+0.1)
  # Hellinger distance between each window profile and the genome profile
  d <- apply( a[,-1], 2,
              function(x,y) sqrt(sum((sqrt(x)-sqrt(y))^2)/2), y=a[,1] )
  d.thr <- quantile(d, probs=0.95, names=FALSE, na.rm=TRUE)
  plot(1, xlim=c(0, length(d)+1), ylim=c(0,1), xlab='', xaxs='i', xaxt='n',
       type='n', pch=19, cex=1/2, col=grey(0.3), bty='n', ylab='Signal', las=1)
  rect(seq_along(d)-1, 0, seq_along(d), d,
       col=ifelse(d>d.thr, grey(0.3), grey(0.5)), border='NA')

  #::: WINDOWS BARPLOT
  par(mar=c(0,5,0,0)+0.1)
  plot(1, type='n', xlim=c(0,ncol(b)+1), xaxs='i', ylim=c(0,1.2),
       yaxs='i', xlab='', ylab='Frequency', bty='n', xaxt='n', yaxt='n')
  axis(2, at=seq(0,1,by=0.2), las=1)
  # Regions (outliers). which() silently skips windows whose distance is NA,
  # where a scalar if() on the comparison would abort the whole plot.
  outliers <- unname(which(d > d.thr))
  regs <- if(length(outliers) > 0) outliers else c()
  if(length(regs)>0){
    x <- regs-0.5
    # Alternate the marker height so neighbouring labels do not overlap
    y <- rep(1.05,length(regs)) + (seq_along(regs) %% 2)/10
    points(x, y, pch=19, cex=3, col='darkred')
    arrows(x0=x, y0=0.01, y1=y, col='darkred', length=0)
    text(x, y, seq_along(regs), col='white', font=2, cex=3/4)
    write.table(regs, paste(wintax,".regions",sep=""), col.names=FALSE,
                row.names=FALSE, quote=FALSE)
  }
  # Bars: one stacked segment per taxon (row of b) and window (column of b)
  h <- rep(0, ncol(b))
  all_cols <- character(nrow(b))
  for(i in seq_len(nrow(b))){
    i.col <- 1+((i-1) %% (length(col)-1))
    hn <- h + as.numeric(b[i, ])
    shown <- which(b[i, ] > 0)  # NA-safe: missing cells are simply not drawn
    if(length(shown) > 0)
      rect(shown-1, h[shown], shown, hn[shown], col=col[i.col], border=NA)
    all_cols[i] <- col[i.col]
    if(i.col+1 == length(col)){
      # End of a colour cycle: darken the palette for the next round of taxa
      k <- col2rgb(col)
      col <- rgb(k[1, ], k[2, ], k[3, ], maxColorValue=256*1.3)
    }
    h <- hn
  }

  #::: GENOME PROFILE
  par(mar=c(0,0,0,2)+0.1)
  plot(1, type='n', xlim=c(0,1), xaxs='i', ylim=c(0,1.2), yaxs='i',
       xlab='', ylab='', bty='n', xaxt='n', yaxt='n')
  rect(0, cumsum(c(0,a[-nrow(a),1])), 1, cumsum(a[, 1]),
       col=all_cols, border=NA)
  text(0.5, 1.1, 'Genome', font=2, cex=1.5, col='darkred')

  #::: DISTANCES BOXPLOT
  par(mar=c(1,0,2,2)+0.1)
  boxplot(d, ylim=c(0,1), yaxs='i', axes=FALSE, col=grey(0.3), pch=19)

  #::: LEGEND
  par(mar=c(0,2,0,2)+0.1)
  plot(1, type='n', bty='n', xlim=c(0,1),
       ylim=c(0,1), xaxs='i', yaxs='i', axes=FALSE)
  legend('top', pt.bg=all_cols, col=grey(0.3), pch=22,
         legend=gsub('.*::','',rownames(b)), ncol=5, cex=3/4, bty='n')

  #::: MAIN
  plot(1, type='n', bty='n', xlim=c(0,1), ylim=c(0,1), axes=FALSE)
  text(.5,.5,main)

  return(regs)
}
89
+
data/utils/mytaxa_scan.rb ADDED
@@ -0,0 +1,58 @@
1
#!/usr/bin/env ruby
#
# Summarizes a MyTaxa classification into taxonomic profiles: one for the whole
# gene set and one per window of consecutive genes.
#
# Arguments:
#   1. FastA file   Sequences whose deflines provide the gene IDs, in order.
#   2. MyTaxa file  Read as pairs of lines: the first field of the odd line is
#                   the gene ID, the even line is its classification.
#   3. Data output  Path of the profile table. The gene IDs of each window are
#                   additionally saved to "<Data output>.genes".

abort "
Usage:
#{$0} {FastA file} {MyTaxa file} {Data output}

" if ARGV[2].nil?

begin
  # Get arguments
  faa, mytaxa, outdata = ARGV
  winsize = 10

  # Extract gene IDs (File.foreach closes the file once it has been read)
  ids = File.foreach(faa).grep(/^>/).map do |dl|
    dl.chomp.sub(/^>/,"").sub(/\s.*/,"")
  end
  # Every gene is unclassified ("NA") until MyTaxa says otherwise
  tax = Hash[ids.map{ |k| [k, "NA"] }]

  # Get MyTaxa distributions
  gene = nil
  File.foreach(mytaxa).with_index(1) do |ln, lno|
    ln.chomp!
    if lno.odd?
      gene = ln.split(/\t/).first
    else
      tax[gene] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
    end
  end

  # Taxa sorted by decreasing abundance; ties keep first-seen order. Counting
  # once avoids rescanning all the values inside the sort comparator.
  tax_count = Hash.new(0)
  tax.each_value{ |t| tax_count[t] += 1 }
  all_tax = tax_count.keys.each_with_index.
    sort_by{ |t, i| [-tax_count[t], i] }.map(&:first)

  # Estimate Windows and save gene IDs
  c = [all_tax.map{ |t| tax_count[t] }]
  # Float division, so that a trailing partial window is not silently dropped
  n_wins = (ids.size.to_f/winsize).ceil
  File.open(outdata + ".genes", "w") do |fh|
    n_wins.times do |win|
      win_ids = ids[win*winsize, winsize]
      win_t = tax.values_at(*win_ids)
      fh.puts win_ids.join("\t")
      c << all_tax.map{ |t| win_t.count(t) }
    end
  end
  # Turn the counts of each column (genome first, then windows) into fractions
  profiles = c.map do |col|
    total = col.inject(:+)
    col.map{ |cell| cell.to_f/total }
  end

  # Save window profiles
  File.open(outdata, "w") do |fh|
    fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
    fh.puts "# " + (["Tax-label", "Genome"] +
      (1 .. n_wins).map{ |i| "Win_#{i}" }).join("\t")
    all_tax.each_with_index do |t, row|
      fh.puts(([t] + profiles.map{ |col| col[row] }).join("\t"))
    end
  end
rescue => err
  # NOTE(review): errors are reported but the script still exits with status 0;
  # confirm whether callers rely on that before changing it.
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  err
end
57
+
58
+
data/utils/requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ Software Test executable Website Notes
2
+ -------- --------------- ------- -----
3
+ Enve-omics scripts FastQ.tag.rb http://github.com/lmrodriguezr/enveomics All the collection must be present
4
+ SolexaQA++ SolexaQA++ http://solexaqa.sourceforge.net Required version: v3.1.3+
5
+ Scythe scythe https://github.com/vsbuffalo/scythe Required version: 0.991+
6
+ FastQC fastqc http://www.bioinformatics.babraham.ac.uk/projects/fastqc
7
+ IDBA idba_ud http://i.cs.hku.hk/~alse/hkubrg/projects/idba
8
+ MetaGeneMark gmhmmp http://exon.gatech.edu/genemark/license_download.cgi The folder must contain the key and the scripts
9
+ HMMer 3.0+ hmmsearch http://hmmer.janelia.org/software
10
+ NCBI BLAST+ blastp ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST
11
+ R R http://www.r-project.org/
12
+ SQLite3 sqlite3 https://www.sqlite.org/
13
+ RAxML (pthreads) raxmlHPC-PTHREADS http://sco.h-its.org/exelixis/web/software/raxml/index.html
14
+ MCL mcl http://micans.org/mcl/
15
+ DIAMOND diamond http://ab.inf.uni-tuebingen.de/software/diamond Required version: v0.7.9+
16
+ MyTaxa MyTaxa http://enve-omics.ce.gatech.edu/mytaxa The folder must contain the db and utils dirs, and the AllGenomes.faa BLAST database
17
+ Krona ktImportText https://github.com/marbl/Krona/wiki
18
+ Barrnap barrnap http://www.vicbioinformatics.com/software.barrnap.shtml
19
+ bedtools bedtools http://bedtools.readthedocs.org/en/latest/
data/utils/subclades-compile.rb ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Jan-15-2016
5
+ # @license artistic license 2.0
6
+ #
7
+
8
# Make the "lib" folder located next to this script loadable
$LOAD_PATH.push File.expand_path("lib", File.dirname(__FILE__))

# The only argument is the directory holding the classification files
dir = ARGV.shift
abort "Usage: #{$0} <classif.dir>" if dir.nil?
10
+
11
# Reads "miga-project.1.classif" from +dir+ and, recursively, from every
# "miga-project.1.sc-<class>" subdirectory that exists for a class seen in it.
#
# Each line is tab-delimited: the first field is the dataset name and the second
# the class it was assigned to. The classes of a dataset are accumulated in
# reading order, so the resulting arrays run from the outermost to the innermost
# level.
#
# dir::     Directory containing the classification file.
# classif:: Hash to extend (dataset name => Array of classes).
#
# Returns +classif+.
def read_classif(dir, classif={})
  klass = []
  # The block form closes the file even if a line fails to parse
  File.foreach(File.expand_path("miga-project.1.classif", dir)) do |ln|
    r = ln.chomp.split("\t")
    classif[r[0]] ||= []
    classif[r[0]] << r[1]
    klass[r[1].to_i] = r[1]
  end
  # klass is indexed by class number and can have gaps: skip them instead of
  # probing a directory with an empty class name
  klass.compact.each do |i|
    d = File.expand_path("miga-project.1.sc-#{i}", dir)
    classif = read_classif(d, classif) if Dir.exist? d
  end
  classif
end
27
+
28
# Builds a Newick-like string (without the trailing semicolon) from a
# classification Hash (dataset name => Array of classes, one per level).
#
# classif:: Datasets to include.
# col::     Level of the classification used to split the datasets.
#
# Datasets sharing a single class (or none) at level +col+ become sibling
# leaves; otherwise each class becomes a subtree labeled "[class]". Returns nil
# when there is nothing to print.
def print_tree(classif, col=0)
  labels = classif.values.map{ |path| path[col] }.compact.uniq
  nodes =
    if labels.size > 1
      labels.map do |label|
        members = classif.select{ |_, path| path[col] == label }
        subtree = print_tree(members, col + 1)
        subtree.nil? ? nil : "#{subtree}[#{label}]"
      end.compact
    else
      classif.keys
    end
  case nodes.size
  when 0 then nil
  when 1 then nodes.first
  else "(#{nodes.join(",")})"
  end
end
42
+
43
# Table (STDOUT): one row per dataset with its class at every level, padded with
# "0" so that all rows have as many columns as the deepest classification.
c = read_classif(dir)
max_depth = c.values.map{ |i| i.count }.max
c.each do |k,v|
  puts ([k] + v + ["0"]*(max_depth-v.count)).join("\t")
end
# Tree (STDERR). print_tree returns nil for an empty classification, so it is
# interpolated rather than concatenated to avoid a NoMethodError on nil.
$stderr.puts "#{print_tree(c)};"
data/utils/subclades.R ADDED
@@ -0,0 +1,171 @@
1
+ library(enveomics.R)
2
+ library(ape)
3
+ library(ggdendro)
4
+ library(ggplot2)
5
+ library(grid)
6
+ library(gridExtra)
7
+ library(cluster)
8
+ library(dendextend)
9
+ library(vegan)
10
+ library(scatterplot3d)
11
+
12
+ # Main function
13
# Cluster datasets by ANI and recursively explore the resulting subclades.
#
# Arguments:
#   ani_file  Path to a tab-delimited table with header (read through gzfile())
#             providing the columns `a`, `b` and `value`. `value` is turned into
#             a distance as 1 - value/100.
#   out_base  Prefix of the output files.
#   thr       Passed to metaMDS() as `parallel`.
#   ani       Table with the same columns, used when `ani_file` is missing
#             (this is how the recursive calls hand over their subset).
#
# Outputs:
#   <out_base>.pdf            Graphic report.
#   <out_base>.<i>.medoids    Medoids of the i-th best clustering.
#   <out_base>.<i>.classif    Per dataset: name, cluster, medoid of the cluster
#                             and ANI (%) to that medoid, for the i-th
#                             clustering.
#   <out_base>.1.sc-<n>/      One folder per cluster of the best clustering
#                             with more than 5 datasets, analyzed recursively.
#   miga-project.ani.nwk      Newick tree of the hierarchical clustering.
subclades <- function(ani_file, out_base, thr=1, ani=c()){
  # Get ANI distances
  cat("====", out_base, "\n")
  if(missing(ani_file)){
    a <- as.data.frame(ani)
  } else {
    a <- read.table(gzfile(ani_file), sep='\t', header=TRUE, as.is=TRUE)
  }
  if(nrow(a)==0){
    # Nothing to cluster: leave a placeholder report and empty result files
    pdf(paste(out_base,'.pdf',sep=''), 7, 12)
    plot(1, type='n', axes=FALSE)
    legend('center','No ANI data',bty='n')
    dev.off()
    file.create(paste(out_base,'.1.classif',sep=''))
    file.create(paste(out_base,'.1.medoids',sep=''))
    return(NULL)
  }
  ani.d <- enve.df2dist(cbind(a$a, a$b, 1-a$value/100), default.d=0.3)
  ani.hc <- hclust(ani.d, method='ward.D2')
  # NOTE(review): this path ignores out_base, so every recursive call
  # overwrites the same file in the working directory -- confirm intent.
  write.tree(as.phylo(ani.hc), 'miga-project.ani.nwk')

  # Silhouette: average width for every k, then keep the (up to) 6 values of k
  # standing out the most from their two neighbours
  k <- 2:(length(labels(ani.d))-1)
  s <- sapply(k, function(x) summary(silhouette(pam(ani.d, x)))$avg.width)
  ds <- 10^(s[-c(1,length(s))]-(s[-length(s)+c(0,1)]+s[-c(1,2)])/2)
  top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=TRUE)],n=6)

  # Save "ANI-types"
  ani.types <- c()
  ani.medoids <- list()
  for(i in seq_along(top.n)){
    k_i <- top.n[i]
    ani.cl <- pam(ani.d, k_i)
    ani.types <- cbind(ani.types, ani.cl$clustering)
    ani.medoids[[ i ]] <- ani.cl$medoids
  }

  # Generate graphic reports
  pdf(paste(out_base,'.pdf',sep=''), 7, 12)
  plotClusterAndMetadata(as.dendrogram(ani.hc), ani.types, main='ANI types')
  ani.mds <- metaMDS(ani.d, k=3, autotransform=FALSE, parallel=thr,
                     wascores=FALSE)
  layout(matrix(1:6, ncol=2))
  for(i in seq_along(top.n)){
    s <- scatterplot3d(ani.mds$points, pch=19, type='h',
                       color=ggplotColours(top.n[i], alpha=1/2)[ani.types[,i]],
                       cex.symbols=1/2, box=FALSE, lty.hplot=3,
                       main=paste('NMDS of ANI distances with', top.n[i] ,'clusters'),
                       angle=80, scale.y=3/2, las=2, xlab='', ylab='', zlab='')
    for(cl in seq_len(top.n[i])){
      col <- ggplotColours(top.n[i])[cl]
      med <- s$xyz.convert(matrix(ani.mds$points[ ani.medoids[[i]][cl] , ],
                                  ncol=3))
      if(sum(ani.types[,i]==cl)>1){
        # Connect every member of the cluster with its medoid
        val <- s$xyz.convert(matrix(ani.mds$points[ ani.types[,i]==cl , ],
                                    ncol=3))
        arrows(x0=med$x, y0=med$y, x1=val$x, y1=val$y, length=0, col=col)
      }
      points(med, col=col, pch=19, cex=3/2)
      text(med, labels=cl, col='white', cex=2/3)
    }
  }
  dev.off()

  # Save results
  # ani.d holds 1 - ANI/100, so the ANI (%) is 100*(1 - d), not 100 - d.
  # The matrix is built once instead of once per dataset.
  ani.pct <- 100 * (1 - as.matrix(ani.d))
  for(i in seq_along(top.n)){
    write.table(ani.medoids[[i]], paste(out_base,i,'medoids',sep='.'),
                quote=FALSE, col.names=FALSE, row.names=FALSE)
    classif <- cbind(rownames(ani.types), ani.types[,i],
                     ani.medoids[[i]][ ani.types[,i] ], NA)
    for(j in seq_len(nrow(classif)))
      classif[j,4] <- ani.pct[classif[j,1], classif[j,3]]
    write.table(classif, paste(out_base,i,'classif',sep='.'),
                quote=FALSE, col.names=FALSE, row.names=FALSE, sep='\t')
  }

  # Explore subclades (clusters of the best-ranked clustering only)
  for(i in seq_len(top.n[1])){
    medoid <- ani.medoids[[1]][i]
    ds_f <- rownames(ani.types)[ ani.types[,1]==i ]
    cat("Analyzing subclade", i, "with medoid:", medoid, "\n")
    cat("  ds_f: ", ds_f, "\n")
    if(length(ds_f) > 5){
      a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
      dir.create(paste(out_base,'.1.sc-',i,sep=''))
      write.table(ds_f,
                  paste(out_base,'.1.sc-',i,'/miga-project.all',sep=''),
                  quote=FALSE, col.names=FALSE, row.names=FALSE)
      cat("  looking for subclades within: ",
          out_base, ".1.sc-", i, "\n", sep="")
      subclades(
        out_base=paste(out_base,'.1.sc-',i,'/miga-project',sep=''),
        thr=thr, ani=a_f)
    }
  }
}
108
+
109
+ # Ancillary functions
110
# Draw a dendrogram next to one strip of tiles per metadata column.
#
# Arguments:
#   c          Dendrogram; its labels() set the order of the rows.
#   m          Matrix of metadata, with rows named after the labels of `c` and
#              one column per strip.
#   addLabels  Should an extra strip with the labels themselves be appended?
#   main       Title passed on to grid.arrange().
#   type       How to draw each column of `m`: 'factor', 'numeric' or 'label'.
#              A single value is recycled for all the columns.
#
# Value:
#   The list passed to grid.arrange(): a blank spacer, one grob per strip, the
#   dendrogram grob, and the entries `nrow`, `widths` and `main`.
plotClusterAndMetadata <- function(c,m,addLabels=TRUE,main='',type='factor'){
  leaves <- labels(c)
  if(length(type)==1) type <- rep(type, ncol(m))
  if(addLabels){
    m <- cbind(m, NA)
    m[leaves,ncol(m)] <- leaves
    type[ncol(m)] <- 'label'
  }
  # Theme shared by all the metadata strips
  strip.theme <- theme(axis.title=element_blank(),
    panel.margin=unit(1,'points'),
    plot.margin=unit(c(40,-12,20,-12),'points'),
    axis.ticks=element_blank(), axis.text=element_blank(),
    legend.position="none")
  ps <- list()
  ps[[1]] <- rectGrob(gp=gpar(col="white"))  # blank spacer on the left
  for(i in seq_len(ncol(m))){
    df <- data.frame(lab=factor(leaves,levels=leaves), feat=m[leaves,i])
    strip.text <- geom_text(size=3/4, label=df$feat, x=.8)
    if(type[i]=='factor'){
      strip <- ggplot(df, aes(1, lab, fill=factor(feat))) +
        geom_tile() + strip.text + scale_x_continuous(expand=c(0,0))
    }else if(type[i]=='numeric'){
      strip <- ggplot(df, aes(1, lab, fill=as.numeric(feat))) +
        geom_tile() + strip.text + scale_x_continuous(expand=c(0,0))
    }else if(type[i]=='label'){
      strip <- ggplot(df, aes(1, lab)) +
        geom_tile(fill='white') + strip.text
    }else{
      stop('Unsupported type: ', type[i])
    }
    ps[[i+1]] <- ggplotGrob(strip + strip.theme)
  }
  # The dendrogram goes right after the last strip, whatever their number
  ps[[length(ps)+1]] <- ggplotGrob(
    ggplot(segment(dendro_data(c, type="rectangle"))) +
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
    scale_x_continuous(expand=c(0,.5)) +
    coord_flip() + theme_dendro() +
    theme(axis.title=element_blank(), axis.ticks=element_blank(),
      plot.margin=unit(c(40,20,20,ifelse(addLabels,-35,-30)),'points'),
      panel.margin=unit(0,'points'), axis.text=element_blank(),
      legend.position="none"))
  # NOTE(review): the previous code computed the maximum panel heights and
  # assigned them to a loop copy of each grob, which never modified `ps`. That
  # dead code was removed; the grobs keep their own heights, exactly as before.
  ps$nrow <- 1
  ps$widths <- c(0.1,rep(.07,ncol(m)),1)
  ps$main <- main
  do.call(grid.arrange, ps)
  return(ps)
}
165
+
166
# Palette of `n` colours evenly spaced in hue, at fixed chroma (100) and
# luminance (65).
#
# Arguments:
#   n      Number of colours.
#   h      Range of hues, as c(first, last).
#   alpha  Opacity passed to hcl().
#
# When the range spans (nearly) the whole hue circle, the last hue is pulled
# back by one step so that it does not repeat the first colour.
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
  full.circle <- (diff(h) %% 360) < 1
  if (full.circle) {
    h[2] <- h[2] - 360/n
  }
  hues <- seq(h[1], h[2], length.out=n)
  hcl(h=hues, c=100, l=65, alpha=alpha)
}
170
+
171
+