miga-base 0.2.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
data/utils/adapters.fa ADDED
@@ -0,0 +1,302 @@
1
+ >Illumina_Single_End_Apapter_1
2
+ ACACTCTTTCCCTACACGACGCTGTTCCATCT
3
+ >Illumina_Single_End_Apapter_2
4
+ CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
5
+ >Illumina_Single_End_PCR_Primer_1
6
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
7
+ >Illumina_Single_End_PCR_Primer_2
8
+ CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
9
+ >Illumina_Single_End_Sequencing_Primer
10
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
11
+
12
+ >Illumina_Paired_End_Adapter_1
13
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
14
+ >Illumina_Paired_End_Adapter_2
15
+ CTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
16
+ >Illumina_Paried_End_PCR_Primer_1
17
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
18
+ >Illumina_Paired_End_PCR_Primer_2
19
+ CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
20
+ >Illumina_Paried_End_Sequencing_Primer_1
21
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
22
+ >Illumina_Paired_End_Sequencing_Primer_2
23
+ CGGTCTCGGCATTCCTACTGAACCGCTCTTCCGATCT
24
+
25
+ >Illumina_DpnII_expression_Adapter_1
26
+ ACAGGTTCAGAGTTCTACAGTCCGAC
27
+ >Illumina_DpnII_expression_Adapter_2
28
+ CAAGCAGAAGACGGCATACGA
29
+ >Illumina_DpnII_expression_PCR_Primer_1
30
+ CAAGCAGAAGACGGCATACGA
31
+ >Illumina_DpnII_expression_PCR_Primer_2
32
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
33
+ >Illumina_DpnII_expression_Sequencing_Primer
34
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
35
+
36
+ >Illumina_NlaIII_expression_Adapter_1
37
+ ACAGGTTCAGAGTTCTACAGTCCGACATG
38
+ >Illumina_NlaIII_expression_Adapter_2
39
+ CAAGCAGAAGACGGCATACGA
40
+ >Illumina_NlaIII_expression_PCR_Primer_1
41
+ CAAGCAGAAGACGGCATACGA
42
+ >Illumina_NlaIII_expression_PCR_Primer_2
43
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
44
+ >Illumina_NlaIII_expression_Sequencing_Primer
45
+ CCGACAGGTTCAGAGTTCTACAGTCCGACATG
46
+
47
+ >Illumina_Small_RNA_Adapter_1
48
+ GTTCAGAGTTCTACAGTCCGACGATC
49
+ >Illumina_Small_RNA_Adapter_2
50
+ TCGTATGCCGTCTTCTGCTTGT
51
+ >Illumina_Small_RNA_RT_Primer
52
+ CAAGCAGAAGACGGCATACGA
53
+ >Illumina_Small_RNA_PCR_Primer_1
54
+ CAAGCAGAAGACGGCATACGA
55
+ >Illumina_Small_RNA_PCR_Primer_2
56
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
57
+ >Illumina_Small_RNA_Sequencing_Primer
58
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
59
+
60
+ >Illumina_Multiplexing_Adapter_1
61
+ GATCGGAAGAGCACACGTCT
62
+ >Illumina_Multiplexing_Adapter_2
63
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
64
+ >Illumina_Multiplexing_PCR_Primer_1.01
65
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
66
+ >Illumina_Multiplexing_PCR_Primer_2.01
67
+ GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
68
+ >Illumina_Multiplexing_Read1_Sequencing_Primer
69
+ ACACTCTTTCCCTACACGACGCTCTTCCGATCT
70
+ >Illumina_Multiplexing_Index_Sequencing_Primer
71
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
72
+ >Illumina_Multiplexing_Read2_Sequencing_Primer
73
+ GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
74
+
75
+ >Illumina_PCR_Primer_Index_1
76
+ CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC
77
+ >Illumina_PCR_Primer_Index_2
78
+ CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC
79
+ >Illumina_PCR_Primer_Index_3
80
+ CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC
81
+ >Illumina_PCR_Primer_Index_4
82
+ CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC
83
+ >Illumina_PCR_Primer_Index_5
84
+ CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC
85
+ >Illumina_PCR_Primer_Index_6
86
+ CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC
87
+ >Illumina_PCR_Primer_Index_7
88
+ CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC
89
+ >Illumina_PCR_Primer_Index_8
90
+ CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC
91
+ >Illumina_PCR_Primer_Index_9
92
+ CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC
93
+ >Illumina_PCR_Primer_Index_10
94
+ CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC
95
+ >Illumina_PCR_Primer_Index_11
96
+ CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC
97
+ >Illumina_PCR_Primer_Index_12
98
+ CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC
99
+
100
+ >Illumina_DpnII_Gex_Adapter_1
101
+ GATCGTCGGACTGTAGAACTCTGAAC
102
+ >Illumina_DpnII_Gex_Adapter_1.01
103
+ ACAGGTTCAGAGTTCTACAGTCCGAC
104
+ >Illumina_DpnII_Gex_Adapter_2
105
+ CAAGCAGAAGACGGCATACGA
106
+ >Illumina_DpnII_Gex_Adapter_2.01
107
+ TCGTATGCCGTCTTCTGCTTG
108
+ >Illumina_DpnII_Gex_PCR_Primer_1
109
+ CAAGCAGAAGACGGCATACGA
110
+ >Illumina_DpnII_Gex_PCR_Primer_2
111
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
112
+ >Illumina_DpnII_Gex_Sequencing_Primer
113
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
114
+
115
+ >Illumina_NlaIII_Gex_Adapter_1.01
116
+ TCGGACTGTAGAACTCTGAAC
117
+ >Illumina_NlaIII_Gex_Adapter_1.02
118
+ ACAGGTTCAGAGTTCTACAGTCCGACATG
119
+ >Illumina_NlaIII_Gex_Adapter_2.01
120
+ CAAGCAGAAGACGGCATACGA
121
+ >Illumina_NlaIII_Gex_Adapter_2.02
122
+ TCGTATGCCGTCTTCTGCTTG
123
+ >Illumina_NlaIII_Gex_PCR_Primer_1
124
+ CAAGCAGAAGACGGCATACGA
125
+ >Illumina_NlaIII_Gex_PCR_Primer_2
126
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
127
+ >Illumina_NlaIII_Gex_Sequencing_Primer
128
+ CCGACAGGTTCAGAGTTCTACAGTCCGACATG
129
+
130
+ >Illumina_Small_RNA_RT_Primer
131
+ CAAGCAGAAGACGGCATACGA
132
+ >Illumina_5p_RNA_Adapter
133
+ GTTCAGAGTTCTACAGTCCGACGATC
134
+ >Illumina_RNA_Adapter1
135
+ TCGTATGCCGTCTTCTGCTTGT
136
+
137
+ >Illumina_Small_RNA_3p_Adapter_1
138
+ ATCTCGTATGCCGTCTTCTGCTTG
139
+ >Illumina_Small_RNA_PCR_Primer_1
140
+ CAAGCAGAAGACGGCATACGA
141
+ >Illumina_Small_RNA_PCR_Primer_2
142
+ AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
143
+ >Illumina_Small_RNA_Sequencing_Primer
144
+ CGACAGGTTCAGAGTTCTACAGTCCGACGATC
145
+
146
+ >TruSeq_Universal_Adapter
147
+ AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
148
+ >TruSeq_Adapter_Index_1
149
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
150
+ >TruSeq_Adapter_Index_2
151
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
152
+ >TruSeq_Adapter_Index_3
153
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
154
+ >TruSeq_Adapter_Index_4
155
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
156
+ >TruSeq_Adapter_Index_5
157
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
158
+ >TruSeq_Adapter_Index_6
159
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
160
+ >TruSeq_Adapter_Index_7
161
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
162
+ >TruSeq_Adapter_Index_8
163
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
164
+ >TruSeq_Adapter_Index_9
165
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
166
+ >TruSeq_Adapter_Index_10
167
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
168
+ >TruSeq_Adapter_Index_11
169
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
170
+ >TruSeq_Adapter_Index_12
171
+ GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
172
+
173
+ >Illumina_RNA_RT_Primer
174
+ GCCTTGGCACCCGAGAATTCCA
175
+ >Illumina_RNA_PCR_Primer
176
+ AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA
177
+
178
+ >RNA_PCR_Primer_Index_1
179
+ CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
180
+ >RNA_PCR_Primer_Index_2
181
+ CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
182
+ >RNA_PCR_Primer_Index_3
183
+ CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
184
+ >RNA_PCR_Primer_Index_4
185
+ CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
186
+ >RNA_PCR_Primer_Index_5
187
+ CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
188
+ >RNA_PCR_Primer_Index_6
189
+ CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
190
+ >RNA_PCR_Primer_Index_7
191
+ CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
192
+ >RNA_PCR_Primer_Index_8
193
+ CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
194
+ >RNA_PCR_Primer_Index_9
195
+ CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
196
+ >RNA_PCR_Primer_Index_10
197
+ CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
198
+ >RNA_PCR_Primer_Index_11
199
+ CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
200
+ >RNA_PCR_Primer_Index_12
201
+ CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
202
+ >RNA_PCR_Primer_Index_13
203
+ CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
204
+ >RNA_PCR_Primer_Index_14
205
+ CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
206
+ >RNA_PCR_Primer_Index_15
207
+ CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
208
+ >RNA_PCR_Primer_Index_16
209
+ CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
210
+ >RNA_PCR_Primer_Index_17
211
+ CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
212
+ >RNA_PCR_Primer_Index_18
213
+ CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
214
+ >RNA_PCR_Primer_Index_19
215
+ CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
216
+ >RNA_PCR_Primer_Index_20
217
+ CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
218
+ >RNA_PCR_Primer_Index_21
219
+ CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
220
+ >RNA_PCR_Primer_Index_22
221
+ CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
222
+ >RNA_PCR_Primer_Index_23
223
+ CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
224
+ >RNA_PCR_Primer_Index_24
225
+ CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
226
+ >RNA_PCR_Primer_Index_25
227
+ CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
228
+ >RNA_PCR_Primer_Index_26
229
+ CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
230
+ >RNA_PCR_Primer_Index_27
231
+ CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
232
+ >RNA_PCR_Primer_Index_28
233
+ CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
234
+ >RNA_PCR_Primer_Index_29
235
+ CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
236
+ >RNA_PCR_Primer_Index_30
237
+ CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
238
+ >RNA_PCR_Primer_Index_31
239
+ CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
240
+ >RNA_PCR_Primer_Index_32
241
+ CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
242
+ >RNA_PCR_Primer_Index_33
243
+ CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
244
+ >RNA_PCR_Primer_Index_34
245
+ CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
246
+ >RNA_PCR_Primer_Index_35
247
+ CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
248
+ >RNA_PCR_Primer_Index_36
249
+ CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
250
+ >RNA_PCR_Primer_Index_37
251
+ CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
252
+ >RNA_PCR_Primer_Index_38
253
+ CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
254
+ >RNA_PCR_Primer_Index_39
255
+ CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
256
+ >RNA_PCR_Primer_Index_40
257
+ CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
258
+ >RNA_PCR_Primer_Index_41
259
+ CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
260
+ >RNA_PCR_Primer_Index_42
261
+ CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
262
+ >RNA_PCR_Primer_Index_43
263
+ CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
264
+ >RNA_PCR_Primer_Index_44
265
+ CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
266
+ >RNA_PCR_Primer_Index_45
267
+ CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
268
+ >RNA_PCR_Primer_Index_46
269
+ CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
270
+ >RNA_PCR_Primer_Index_47
271
+ CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
272
+ >RNA_PCR_Primer_Index_48
273
+ CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
274
+
275
+ >ABI_Dynabead_EcoP_Oligo
276
+ CTGATCTAGAGGTACCGGATCCCAGCAGT
277
+ >ABI_Solid3_Adapter_A
278
+ CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG
279
+ >ABI_Solid3_Adapter_B
280
+ CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT
281
+ >ABI_Solid3_5_AMP_Primer
282
+ CCACTACGCCTCCGCTTTCCTCTCTATG
283
+ >ABI_Solid3_3_AMP_Primer
284
+ CTGCCCCGGGTTCCTCATTCT
285
+ >ABI_Solid3_EF1_alpha_Sense_Primer
286
+ CATGTGTGTTGAGAGCTTC
287
+ >ABI_Solid3_EF1_alpha_Antisense_Primer
288
+ GAAAACCAAAGTGGTCCAC
289
+ >ABI_Solid3_GAPDH_Forward_Primer
290
+ TTAGCACCCCTGGCCAAGG
291
+ >ABI_Solid3_GAPDH_Reverse_Primer
292
+ CTTACTCCTTGGAGGCCATG
293
+ >TruSeq2_SE
294
+ AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG
295
+ >TruSeq2_PE_f
296
+ AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
297
+ >TruSeq2_PE_r
298
+ AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
299
+ >TruSeq3_IndexedAdapter
300
+ AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
301
+ >TruSeq3_UniversalAdapter
302
+ AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA
@@ -0,0 +1,89 @@
1
+
2
+ mytaxa.scan <- function(
3
+ wintax,
4
+ col=c('#4dbeee','#7e2f8e','#0072bd','#d95319',
5
+ '#edb120','#77ac30','#a2142f'),
6
+ main='MyTaxa scan'){
7
+ a <- read.table(wintax, sep='\t', h=F, row.names=1, na.strings='', quote='');
8
+ if(! "NA" %in% rownames(a)) a["NA", ] <- 0
9
+ b <- as.matrix(a[-which(rownames(a)=="NA"),-1]);
10
+ if(ncol(b) <= 1){
11
+ plot(1,t='n',bty='n',axes=FALSE);
12
+ legend('center',legend='Insufficient data');
13
+ return(c());
14
+ }
15
+
16
+ layout(matrix(c(6,6,1,4,2,3,5,5),byrow=T,ncol=2),
17
+ widths=c(7,1), heights=c(1/4,1,2,3));
18
+
19
+ #::: DISTANCES
20
+ par(mar=c(1,5,2,0)+0.1);
21
+ d <- apply( a[,-1], 2,
22
+ function(x,y) sqrt(sum((sqrt(x)-sqrt(y))^2)/2), y=a[,1] );
23
+ d.thr <- quantile(d, probs=0.95, names=F, na.rm=TRUE)
24
+ plot(1, xlim=c(0, length(d)+1), ylim=c(0,1), xlab='', xaxs='i', xaxt='n',
25
+ t='n', pch=19, cex=1/2, col=grey(0.3), bty='n', ylab='Signal', las=1);
26
+ rect((1:length(d))-1, 0, 1:length(d), d, col=ifelse(d>d.thr, grey(0.3),
27
+ grey(0.5)), border='NA');
28
+
29
+ #::: WINDOWS BARPLOT
30
+ par(mar=c(0,5,0,0)+0.1);
31
+ plot(1, t='n', xlim=c(0,ncol(b)+1), xaxs='i', ylim=c(0,1.2),
32
+ yaxs='i', xlab='', ylab='Frequency', bty='n', xaxt='n', yaxt='n');
33
+ axis(2, at=seq(0,1,by=0.2), las=1);
34
+ # Regions (outliers)
35
+ regs <- c();
36
+ for(j in 1:ncol(b)) if(d[j] > d.thr) regs <- c(regs, j);
37
+ if(length(regs)>0){
38
+ x <- regs-0.5;
39
+ y <- rep(1.05,length(regs)) + ((1:length(regs)) %% 2)/10;
40
+ points(x, y, pch=19, cex=3, col='darkred');
41
+ arrows(x0=x, y0=0.01, y1=y, col='darkred', length=0);
42
+ text(x, y, 1:length(regs), col='white', font=2, cex=3/4);
43
+ write.table(regs, paste(wintax,".regions",sep=""), col.names=F,
44
+ row.names=F, quote=F)
45
+ }
46
+ # Bars
47
+ h <- rep(0, ncol(b));
48
+ all_cols <- c();
49
+ for(i in 1:nrow(b)){
50
+ i.col = 1+((i-1) %% (length(col)-1));
51
+ hn <- h + as.numeric(b[i, ]);
52
+ for(j in 1:ncol(b))
53
+ if(b[i,j]>0)
54
+ rect(j-1, h[j], j, hn[j], col=col[i.col], border=NA);
55
+ all_cols <- c(all_cols, col[i.col]);
56
+ if(i.col+1 == length(col))
57
+ for(j in 1:length(col)){
58
+ k = col2rgb(col[j]);
59
+ col[j] = rgb(k[1], k[2], k[3], maxColorValue=256*1.3)
60
+ }
61
+ h <- hn;
62
+ }
63
+
64
+ #::: GENOME PROFILE
65
+ par(mar=c(0,0,0,2)+0.1);
66
+ plot(1, t='n', xlim=c(0,1), xaxs='i', ylim=c(0,1.2), yaxs='i',
67
+ xlab='', ylab='', bty='n', xaxt='n', yaxt='n');
68
+ rect(0, cumsum(c(0,a[-nrow(a),1])), 1, cumsum(a[, 1]),
69
+ col=all_cols, border=NA);
70
+ text(0.5, 1.1, 'Genome', font=2, cex=1.5, col='darkred');
71
+
72
+ #::: DISTANCES BOXPLOT
73
+ par(mar=c(1,0,2,2)+0.1);
74
+ boxplot(d, ylim=c(0,1), yaxs='i', axes=F, col=grey(0.3), pch=19);
75
+
76
+ #::: LEGEND
77
+ par(mar=c(0,2,0,2)+0.1);
78
+ plot(1, t='n', bty='n', xlim=c(0,1),
79
+ ylim=c(0,1), xaxs='i', yaxs='i', axes=F);
80
+ legend('top', pt.bg=all_cols, col=grey(0.3), pch=22,
81
+ legend=gsub('.*::','',rownames(b)), ncol=5, cex=3/4, bty='n');
82
+
83
+ #::: MAIN
84
+ plot(1, t='n', bty='n', xlim=c(0,1), ylim=c(0,1), axes=F);
85
+ text(.5,.5,main);
86
+
87
+ return(regs);
88
+ }
89
+
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ abort "
4
+ Usage:
5
+ #{$0} {FastA file} {MyTaxa file} {Data output}
6
+
7
+ " if ARGV[2].nil?
8
+
9
+ begin
10
+ # Get arguments
11
+ faa, mytaxa, outdata = ARGV
12
+ winsize = 10
13
+
14
+ # Extract gene IDs
15
+ ids = File.open(faa).grep(/^>/).map{|dl| dl.chomp.sub(/^>/,"").sub(/\s.*/,"")}
16
+ tax = Hash[ids.map{|k| [k, "NA"]}]
17
+
18
+ # Get MyTaxa distributions
19
+ k, l = nil
20
+ File.open(mytaxa).each do |ln|
21
+ ln.chomp!
22
+ if $.%2 == 1
23
+ k, l = ln.split /\t/
24
+ else
25
+ tax[k] = ln.gsub(/<[^>]+>/,"").gsub(/;/,"::")
26
+ end
27
+ end
28
+ all_tax = tax.values.uniq.sort{|x,y| tax.values.count(y) <=> tax.values.count(x) }
29
+
30
+ # Estimate Windows and save gene IDs
31
+ fh = File.open(outdata + ".genes", "w")
32
+ c = []
33
+ c << all_tax.map{|t| tax.values.count(t) }
34
+ n_wins = (ids.size.to_f/winsize).ceil # float division: Integer#/ truncates, which made .ceil a no-op and dropped the last partial window
35
+ (0 .. (n_wins-1)).each do |win|
36
+ k = ids[win*winsize, winsize]
37
+ win_t = tax.values_at(*k)
38
+ fh.puts k.join("\t")
39
+ c << all_tax.map{|t| win_t.count(t)}
40
+ end
41
+ p = c.map{|col| col.map{|cell| cell.to_f/col.inject(:+)}}
42
+ fh.close
43
+
44
+ # Save window profiles
45
+ fh = File.open(outdata, "w")
46
+ fh.puts "# Data derived from #{mytaxa}, with #{winsize}-genes windows"
47
+ fh.puts "# " + (["Tax-label", "Genome"] + (1 .. n_wins).map{|i| "Win_#{i}"}).join("\t")
48
+ (0 .. (all_tax.size - 1)).each do |row|
49
+ fh.puts ([all_tax[row]] + p.map{|col| col[row]}).join "\t"
50
+ end
51
+ fh.close
52
+ rescue => err
53
+ $stderr.puts "Exception: #{err}\n\n"
54
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
55
+ err
56
+ end
57
+
58
+
@@ -0,0 +1,19 @@
1
+ Software Test executable Website Notes
2
+ -------- --------------- ------- -----
3
+ Enve-omics scripts FastQ.tag.rb http://github.com/lmrodriguezr/enveomics All the collection must be present
4
+ SolexaQA++ SolexaQA++ http://solexaqa.sourceforge.net Required version: v3.1.3+
5
+ Scythe scythe https://github.com/vsbuffalo/scythe Required version: 0.991+
6
+ FastQC fastqc http://www.bioinformatics.babraham.ac.uk/projects/fastqc
7
+ IDBA idba_ud http://i.cs.hku.hk/~alse/hkubrg/projects/idba
8
+ MetaGeneMark gmhmmp http://exon.gatech.edu/genemark/license_download.cgi The folder must contain the key and the scripts
9
+ HMMer 3.0+ hmmsearch http://hmmer.janelia.org/software
10
+ NCBI BLAST+ blastp ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST
11
+ R R http://www.r-project.org/
12
+ SQLite3 sqlite3 https://www.sqlite.org/
13
+ RAxML (pthreads) raxmlHPC-PTHREADS http://sco.h-its.org/exelixis/web/software/raxml/index.html
14
+ MCL mcl http://micans.org/mcl/
15
+ DIAMOND diamond http://ab.inf.uni-tuebingen.de/software/diamond Required version: v0.7.9+
16
+ MyTaxa MyTaxa http://enve-omics.ce.gatech.edu/mytaxa The folder must contain the db and utils dirs, and the AllGenomes.faa BLAST database
17
+ Krona ktImportText https://github.com/marbl/Krona/wiki
18
+ Barrnap barrnap http://www.vicbioinformatics.com/software.barrnap.shtml
19
+ bedtools bedtools http://bedtools.readthedocs.org/en/latest/
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # @author Luis M. Rodriguez-R
4
+ # @update Jan-15-2016
5
+ # @license artistic license 2.0
6
+ #
7
+
8
+ $:.push File.expand_path(File.dirname(__FILE__) + "/lib")
9
+ dir = ARGV.shift or abort "Usage: #{$0} <classif.dir>"
10
+
11
+ def read_classif(dir, classif={})
12
+ fh = File.open(File.expand_path("miga-project.1.classif", dir), "r")
13
+ klass = []
14
+ while ln = fh.gets
15
+ r = ln.chomp.split("\t")
16
+ classif[r[0]] ||= []
17
+ classif[r[0]] << r[1]
18
+ klass[r[1].to_i] = r[1]
19
+ end
20
+ fh.close
21
+ klass.each do |i|
22
+ d = File.expand_path("miga-project.1.sc-#{i}", dir)
23
+ classif = read_classif(d, classif) if Dir.exist? d
24
+ end
25
+ classif
26
+ end
27
+
28
+ def print_tree(classif, col=0)
29
+ klass = classif.values.map{ |i| i[col] }.compact.uniq
30
+ if klass.size<=1
31
+ o = classif.keys
32
+ else
33
+ o = klass.map do |c|
34
+ oo = print_tree(classif.select{ |k,v| v[col]==c }, col+1)
35
+ "#{oo}[#{c}]" unless oo.nil?
36
+ end.compact
37
+ end
38
+ o.size==0 ? nil :
39
+ o.size==1 ? o[0] :
40
+ "(#{o.join(",")})"
41
+ end
42
+
43
+ c = read_classif(dir)
44
+ max_depth = c.values.map{|i| i.count}.max
45
+ c.each do |k,v|
46
+ puts ([k] + v + ["0"]*(max_depth-v.count)).join("\t")
47
+ end
48
+ $stderr.puts "#{print_tree(c)};" # interpolate: print_tree returns nil when the classification is empty, and nil + ";" would raise
data/utils/subclades.R ADDED
@@ -0,0 +1,171 @@
1
+ library(enveomics.R)
2
+ library(ape)
3
+ library(ggdendro)
4
+ library(ggplot2)
5
+ library(grid)
6
+ library(gridExtra)
7
+ library(cluster)
8
+ library(dendextend)
9
+ library(vegan)
10
+ library(scatterplot3d)
11
+
12
+ # Main function
13
+ subclades <- function(ani_file, out_base, thr=1, ani=c()){
14
+ # Get ANI distances
15
+ cat("====", out_base, "\n")
16
+ if(missing(ani_file)){
17
+ a <- as.data.frame(ani)
18
+ } else {
19
+ a <- read.table(gzfile(ani_file), sep='\t', h=TRUE, as.is=T)
20
+ }
21
+ if(nrow(a)==0){
22
+ pdf(paste(out_base,'.pdf',sep=''), 7, 12)
23
+ plot(1,t='n',axes=F)
24
+ legend('center','No ANI data',bty='n')
25
+ dev.off()
26
+ file.create(paste(out_base,'.1.classif',sep=''))
27
+ file.create(paste(out_base,'.1.medoids',sep=''))
28
+ return(NULL)
29
+ }
30
+ ani.d <- enve.df2dist(cbind(a$a, a$b, 1-a$value/100), default.d=0.3)
31
+ ani.hc <- hclust(ani.d, method='ward.D2')
32
+ write.tree(as.phylo(ani.hc), 'miga-project.ani.nwk')
33
+
34
+ # Silhouette
35
+ k <- 2:(length(labels(ani.d))-1)
36
+ s <- sapply(k, function(x) summary(silhouette(pam(ani.d, x)))$avg.width)
37
+ ds <- 10^(s[-c(1,length(s))]-(s[-length(s)+c(0,1)]+s[-c(1,2)])/2)
38
+ top.n <- head(k[order(c(-Inf,ds,-Inf), decreasing=T)],n=6)
39
+
40
+ # Save "ANI-types"
41
+ ani.types <- c()
42
+ ani.medoids <- list()
43
+ for(i in 1:length(top.n)){
44
+ k_i <- top.n[i]
45
+ ani.cl <- pam(ani.d, k_i)
46
+ ani.types <- cbind(ani.types, ani.cl$clustering)
47
+ ani.medoids[[ i ]] <- ani.cl$medoids
48
+ }
49
+
50
+ # Generate graphic reports
51
+ pdf(paste(out_base,'.pdf',sep=''), 7, 12)
52
+ plotClusterAndMetadata(as.dendrogram(ani.hc), ani.types, main='ANI types')
53
+ ani.mds <- metaMDS(ani.d, k=3, autotransform=FALSE, parallel=thr, wascores=F)
54
+ layout(matrix(1:6, ncol=2))
55
+ for(i in 1:length(top.n)){
56
+ s <- scatterplot3d(ani.mds$points, pch=19, type='h',
57
+ color=ggplotColours(top.n[i], alpha=1/2)[ani.types[,i]],
58
+ cex.symbols=1/2, box=FALSE, lty.hplot=3,
59
+ main=paste('NMDS of ANI distances with', top.n[i] ,'clusters'),
60
+ angle=80, scale.y=3/2, las=2, xlab='', ylab='', zlab='')
61
+ for(cl in 1:top.n[i]){
62
+ col <- ggplotColours(top.n[i])[cl]
63
+ med <- s$xyz.convert(matrix(ani.mds$points[ ani.medoids[[i]][cl] , ],
64
+ ncol=3))
65
+ if(sum(ani.types[,i]==cl)>1){
66
+ val <- s$xyz.convert(matrix(ani.mds$points[ ani.types[,i]==cl , ],
67
+ ncol=3))
68
+ arrows(x0=med$x, y0=med$y, x1=val$x, y1=val$y, length=0, col=col)
69
+ }
70
+ points(med, col=col, pch=19, cex=3/2)
71
+ text(med, labels=cl, col='white', cex=2/3)
72
+ }
73
+ }
74
+ dev.off()
75
+
76
+ # Save results
77
+ for(i in 1:length(top.n)){
78
+ write.table(ani.medoids[[i]], paste(out_base,i,'medoids',sep='.'),
79
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
80
+ classif <- cbind(rownames(ani.types), ani.types[,i],
81
+ ani.medoids[[i]][ ani.types[,i] ], NA)
82
+ for(j in 1:nrow(classif))
83
+ classif[j,4] <- 100 - 100*as.matrix(ani.d)[classif[j,1], classif[j,3]] # ani.d stores 1-ANI/100, so rescale the distance to recover ANI (%) to the medoid
84
+ write.table(classif, paste(out_base,i,'classif',sep='.'),
85
+ quote=FALSE, col.names=FALSE, row.names=FALSE, sep='\t')
86
+ }
87
+
88
+ # Explore subclades
89
+ for(i in 1:top.n[1]){
90
+ medoid <- ani.medoids[[1]][i]
91
+ ds_f <- rownames(ani.types)[ ani.types[,1]==i ]
92
+ cat("Analyzing subclade", i, "with medoid:", medoid, "\n")
93
+ cat(" ds_f: ", ds_f, "\n")
94
+ if(length(ds_f) > 5){
95
+ a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
96
+ dir.create(paste(out_base,'.1.sc-',i,sep=''))
97
+ write.table(ds_f,
98
+ paste(out_base,'.1.sc-',i,'/miga-project.all',sep=''),
99
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
100
+ cat(" looking for subclades within: ",
101
+ out_base, ".1.sc-", i, "\n", sep="")
102
+ subclades(
103
+ out_base=paste(out_base,'.1.sc-',i,'/miga-project',sep=''),
104
+ thr=thr, ani=a_f)
105
+ }
106
+ }
107
+ }
108
+
109
+ # Ancillary functions
110
+ plotClusterAndMetadata <- function(c,m,addLabels=TRUE,main='',type='factor'){
111
+ ps <- list()
112
+ ps[[1]] <- rectGrob(gp=gpar(col="white"))
113
+ if(length(type)==1) type <- rep(type, ncol(m))
114
+ if(addLabels){
115
+ m <- cbind(m, NA)
116
+ m[labels(c),ncol(m)] <- labels(c)
117
+ type[ncol(m)] <- 'label'
118
+ }
119
+ for(i in 1:ncol(m)){
120
+ df <- data.frame(lab=factor(labels(c),levels=labels(c)),
121
+ feat=m[labels(c),i])
122
+ if(type[i]=='factor'){
123
+ ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1, lab, fill=factor(feat))) +
124
+ geom_tile() + geom_text(size=3/4, label=df$feat, x=.8) +
125
+ scale_x_continuous(expand=c(0,0)) +
126
+ theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
127
+ plot.margin=unit(c(40,-12,20,-12),'points'),
128
+ axis.ticks=element_blank(), axis.text=element_blank(),
129
+ legend.position="none"))
130
+ }else if(type[i]=='numeric'){
131
+ ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1,lab,fill=as.numeric(feat))) +
132
+ geom_tile() + geom_text(size=3/4, label=df$feat, x=.8) +
133
+ scale_x_continuous(expand=c(0,0)) +
134
+ theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
135
+ plot.margin=unit(c(40,-12,20,-12),'points'),
136
+ axis.ticks=element_blank(), axis.text=element_blank(),
137
+ legend.position="none"))
138
+ }else if(type[i]=='label'){
139
+ ps[[i+1]] <- ggplotGrob(ggplot(df, aes(1, lab)) +
140
+ geom_tile(fill='white') + geom_text(size=3/4, label=df$feat, x=.8) +
141
+ theme(axis.title=element_blank(), panel.margin=unit(1,'points'),
142
+ plot.margin=unit(c(40,-12,20,-12),'points'),
143
+ axis.ticks=element_blank(), axis.text=element_blank(),
144
+ legend.position="none"))
145
+ }else{
146
+ stop('Unsupported type: ', type[i])
147
+ }
148
+ }
149
+ ps[[i+2]] <- ggplotGrob(ggplot(segment(dendro_data(c, type="rectangle"))) +
150
+ geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
151
+ scale_x_continuous(expand=c(0,.5)) +
152
+ coord_flip() + theme_dendro() +
153
+ theme(axis.title=element_blank(), axis.ticks=element_blank(),
154
+ plot.margin=unit(c(40,20,20,ifelse(addLabels,-35,-30)),'points'),
155
+ panel.margin=unit(0,'points'), axis.text=element_blank(),
156
+ legend.position="none"))
157
+ maxHeights = do.call(grid::unit.pmax, lapply(ps, function(x) x$heights[2:5]))
158
+ for(g in ps) g$heights[2:5] <- as.list(maxHeights)
159
+ ps$nrow <- 1
160
+ ps$widths <- c(0.1,rep(.07,ncol(m)),1)
161
+ ps$main <- main
162
+ do.call(grid.arrange, ps)
163
+ return(ps)
164
+ }
165
+
166
+ ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
167
+ if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
168
+ hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
169
+ }
170
+
171
+