miga-base 0.2.0.6 → 0.2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE +201 -0
- data/README.md +17 -335
- data/Rakefile +31 -0
- data/actions/add_result +2 -5
- data/actions/add_taxonomy +4 -7
- data/actions/create_dataset +5 -6
- data/actions/create_project +2 -5
- data/actions/daemon +2 -5
- data/actions/download_dataset +88 -58
- data/actions/find_datasets +36 -38
- data/actions/import_datasets +2 -5
- data/actions/index_taxonomy +2 -5
- data/actions/list_datasets +47 -49
- data/actions/list_files +7 -11
- data/actions/unlink_dataset +2 -5
- data/bin/miga +1 -1
- data/lib/miga/common.rb +132 -0
- data/lib/miga/daemon.rb +229 -168
- data/lib/miga/dataset.rb +354 -277
- data/lib/miga/gui.rb +346 -269
- data/lib/miga/metadata.rb +115 -71
- data/lib/miga/project.rb +361 -259
- data/lib/miga/remote_dataset.rb +200 -148
- data/lib/miga/result.rb +150 -99
- data/lib/miga/tax_index.rb +124 -67
- data/lib/miga/taxonomy.rb +129 -100
- data/lib/miga/version.rb +57 -0
- data/lib/miga.rb +2 -77
- data/scripts/_distances_noref_nomulti.bash +2 -0
- data/scripts/_distances_ref_nomulti.bash +2 -0
- data/scripts/aai_distances.bash +1 -0
- data/scripts/ani_distances.bash +1 -0
- data/scripts/assembly.bash +1 -0
- data/scripts/cds.bash +1 -0
- data/scripts/clade_finding.bash +17 -1
- data/scripts/distances.bash +1 -0
- data/scripts/essential_genes.bash +1 -0
- data/scripts/haai_distances.bash +1 -0
- data/scripts/init.bash +2 -0
- data/scripts/mytaxa.bash +1 -0
- data/scripts/mytaxa_scan.bash +1 -0
- data/scripts/ogs.bash +1 -0
- data/scripts/read_quality.bash +1 -0
- data/scripts/ssu.bash +1 -0
- data/scripts/subclades.bash +1 -0
- data/scripts/trimmed_fasta.bash +1 -0
- data/scripts/trimmed_reads.bash +1 -0
- data/test/common_test.rb +82 -0
- data/test/daemon_test.rb +53 -0
- data/test/dataset_test.rb +156 -0
- data/test/jruby_gui_test.rb +20 -0
- data/test/metadata_test.rb +48 -0
- data/test/project_test.rb +54 -0
- data/test/remote_dataset_test.rb +41 -0
- data/test/tax_index_test.rb +44 -0
- data/test/taxonomy_test.rb +36 -0
- data/test/test_helper.rb +32 -0
- metadata +53 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 01b3728971a5d407f85578447d4a66dc4c8ab8a8
|
4
|
+
data.tar.gz: 656535155e0316681f2d7aa9bcc8b501caed9d96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5daf2a27f6a6119e18e5eda94dbda72be91ad7b46f4f8bea401f111ce95d907c88930b08d85543c1694a82aeac54c42dff5d0c3586c0846123d1e1881ad23885
|
7
|
+
data.tar.gz: 0a75b0152ad7374729c1cb09b02b539b70429902850c81cdb086f64e9b616cc66f5ba032469be85505c103dc05d15b9469b2c26689b8b4dee2b0c3ef6636ad61
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
The Artistic License 2.0
|
2
|
+
|
3
|
+
Copyright (c) 2016 Luis M Rodriguez-R
|
4
|
+
|
5
|
+
Everyone is permitted to copy and distribute verbatim copies
|
6
|
+
of this license document, but changing it is not allowed.
|
7
|
+
|
8
|
+
Preamble
|
9
|
+
|
10
|
+
This license establishes the terms under which a given free software
|
11
|
+
Package may be copied, modified, distributed, and/or redistributed.
|
12
|
+
The intent is that the Copyright Holder maintains some artistic
|
13
|
+
control over the development of that Package while still keeping the
|
14
|
+
Package available as open source and free software.
|
15
|
+
|
16
|
+
You are always permitted to make arrangements wholly outside of this
|
17
|
+
license directly with the Copyright Holder of a given Package. If the
|
18
|
+
terms of this license do not permit the full use that you propose to
|
19
|
+
make of the Package, you should contact the Copyright Holder and seek
|
20
|
+
a different licensing arrangement.
|
21
|
+
|
22
|
+
Definitions
|
23
|
+
|
24
|
+
"Copyright Holder" means the individual(s) or organization(s)
|
25
|
+
named in the copyright notice for the entire Package.
|
26
|
+
|
27
|
+
"Contributor" means any party that has contributed code or other
|
28
|
+
material to the Package, in accordance with the Copyright Holder's
|
29
|
+
procedures.
|
30
|
+
|
31
|
+
"You" and "your" means any person who would like to copy,
|
32
|
+
distribute, or modify the Package.
|
33
|
+
|
34
|
+
"Package" means the collection of files distributed by the
|
35
|
+
Copyright Holder, and derivatives of that collection and/or of
|
36
|
+
those files. A given Package may consist of either the Standard
|
37
|
+
Version, or a Modified Version.
|
38
|
+
|
39
|
+
"Distribute" means providing a copy of the Package or making it
|
40
|
+
accessible to anyone else, or in the case of a company or
|
41
|
+
organization, to others outside of your company or organization.
|
42
|
+
|
43
|
+
"Distributor Fee" means any fee that you charge for Distributing
|
44
|
+
this Package or providing support for this Package to another
|
45
|
+
party. It does not mean licensing fees.
|
46
|
+
|
47
|
+
"Standard Version" refers to the Package if it has not been
|
48
|
+
modified, or has been modified only in ways explicitly requested
|
49
|
+
by the Copyright Holder.
|
50
|
+
|
51
|
+
"Modified Version" means the Package, if it has been changed, and
|
52
|
+
such changes were not explicitly requested by the Copyright
|
53
|
+
Holder.
|
54
|
+
|
55
|
+
"Original License" means this Artistic License as Distributed with
|
56
|
+
the Standard Version of the Package, in its current version or as
|
57
|
+
it may be modified by The Perl Foundation in the future.
|
58
|
+
|
59
|
+
"Source" form means the source code, documentation source, and
|
60
|
+
configuration files for the Package.
|
61
|
+
|
62
|
+
"Compiled" form means the compiled bytecode, object code, binary,
|
63
|
+
or any other form resulting from mechanical transformation or
|
64
|
+
translation of the Source form.
|
65
|
+
|
66
|
+
|
67
|
+
Permission for Use and Modification Without Distribution
|
68
|
+
|
69
|
+
(1) You are permitted to use the Standard Version and create and use
|
70
|
+
Modified Versions for any purpose without restriction, provided that
|
71
|
+
you do not Distribute the Modified Version.
|
72
|
+
|
73
|
+
|
74
|
+
Permissions for Redistribution of the Standard Version
|
75
|
+
|
76
|
+
(2) You may Distribute verbatim copies of the Source form of the
|
77
|
+
Standard Version of this Package in any medium without restriction,
|
78
|
+
either gratis or for a Distributor Fee, provided that you duplicate
|
79
|
+
all of the original copyright notices and associated disclaimers. At
|
80
|
+
your discretion, such verbatim copies may or may not include a
|
81
|
+
Compiled form of the Package.
|
82
|
+
|
83
|
+
(3) You may apply any bug fixes, portability changes, and other
|
84
|
+
modifications made available from the Copyright Holder. The resulting
|
85
|
+
Package will still be considered the Standard Version, and as such
|
86
|
+
will be subject to the Original License.
|
87
|
+
|
88
|
+
|
89
|
+
Distribution of Modified Versions of the Package as Source
|
90
|
+
|
91
|
+
(4) You may Distribute your Modified Version as Source (either gratis
|
92
|
+
or for a Distributor Fee, and with or without a Compiled form of the
|
93
|
+
Modified Version) provided that you clearly document how it differs
|
94
|
+
from the Standard Version, including, but not limited to, documenting
|
95
|
+
any non-standard features, executables, or modules, and provided that
|
96
|
+
you do at least ONE of the following:
|
97
|
+
|
98
|
+
(a) make the Modified Version available to the Copyright Holder
|
99
|
+
of the Standard Version, under the Original License, so that the
|
100
|
+
Copyright Holder may include your modifications in the Standard
|
101
|
+
Version.
|
102
|
+
|
103
|
+
(b) ensure that installation of your Modified Version does not
|
104
|
+
prevent the user installing or running the Standard Version. In
|
105
|
+
addition, the Modified Version must bear a name that is different
|
106
|
+
from the name of the Standard Version.
|
107
|
+
|
108
|
+
(c) allow anyone who receives a copy of the Modified Version to
|
109
|
+
make the Source form of the Modified Version available to others
|
110
|
+
under
|
111
|
+
|
112
|
+
(i) the Original License or
|
113
|
+
|
114
|
+
(ii) a license that permits the licensee to freely copy,
|
115
|
+
modify and redistribute the Modified Version using the same
|
116
|
+
licensing terms that apply to the copy that the licensee
|
117
|
+
received, and requires that the Source form of the Modified
|
118
|
+
Version, and of any works derived from it, be made freely
|
119
|
+
available in that license fees are prohibited but Distributor
|
120
|
+
Fees are allowed.
|
121
|
+
|
122
|
+
|
123
|
+
Distribution of Compiled Forms of the Standard Version
|
124
|
+
or Modified Versions without the Source
|
125
|
+
|
126
|
+
(5) You may Distribute Compiled forms of the Standard Version without
|
127
|
+
the Source, provided that you include complete instructions on how to
|
128
|
+
get the Source of the Standard Version. Such instructions must be
|
129
|
+
valid at the time of your distribution. If these instructions, at any
|
130
|
+
time while you are carrying out such distribution, become invalid, you
|
131
|
+
must provide new instructions on demand or cease further distribution.
|
132
|
+
If you provide valid instructions or cease distribution within thirty
|
133
|
+
days after you become aware that the instructions are invalid, then
|
134
|
+
you do not forfeit any of your rights under this license.
|
135
|
+
|
136
|
+
(6) You may Distribute a Modified Version in Compiled form without
|
137
|
+
the Source, provided that you comply with Section 4 with respect to
|
138
|
+
the Source of the Modified Version.
|
139
|
+
|
140
|
+
|
141
|
+
Aggregating or Linking the Package
|
142
|
+
|
143
|
+
(7) You may aggregate the Package (either the Standard Version or
|
144
|
+
Modified Version) with other packages and Distribute the resulting
|
145
|
+
aggregation provided that you do not charge a licensing fee for the
|
146
|
+
Package. Distributor Fees are permitted, and licensing fees for other
|
147
|
+
components in the aggregation are permitted. The terms of this license
|
148
|
+
apply to the use and Distribution of the Standard or Modified Versions
|
149
|
+
as included in the aggregation.
|
150
|
+
|
151
|
+
(8) You are permitted to link Modified and Standard Versions with
|
152
|
+
other works, to embed the Package in a larger work of your own, or to
|
153
|
+
build stand-alone binary or bytecode versions of applications that
|
154
|
+
include the Package, and Distribute the result without restriction,
|
155
|
+
provided the result does not expose a direct interface to the Package.
|
156
|
+
|
157
|
+
|
158
|
+
Items That are Not Considered Part of a Modified Version
|
159
|
+
|
160
|
+
(9) Works (including, but not limited to, modules and scripts) that
|
161
|
+
merely extend or make use of the Package, do not, by themselves, cause
|
162
|
+
the Package to be a Modified Version. In addition, such works are not
|
163
|
+
considered parts of the Package itself, and are not subject to the
|
164
|
+
terms of this license.
|
165
|
+
|
166
|
+
|
167
|
+
General Provisions
|
168
|
+
|
169
|
+
(10) Any use, modification, and distribution of the Standard or
|
170
|
+
Modified Versions is governed by this Artistic License. By using,
|
171
|
+
modifying or distributing the Package, you accept this license. Do not
|
172
|
+
use, modify, or distribute the Package, if you do not accept this
|
173
|
+
license.
|
174
|
+
|
175
|
+
(11) If your Modified Version has been derived from a Modified
|
176
|
+
Version made by someone other than you, you are nevertheless required
|
177
|
+
to ensure that your Modified Version complies with the requirements of
|
178
|
+
this license.
|
179
|
+
|
180
|
+
(12) This license does not grant you the right to use any trademark,
|
181
|
+
service mark, tradename, or logo of the Copyright Holder.
|
182
|
+
|
183
|
+
(13) This license includes the non-exclusive, worldwide,
|
184
|
+
free-of-charge patent license to make, have made, use, offer to sell,
|
185
|
+
sell, import and otherwise transfer the Package with respect to any
|
186
|
+
patent claims licensable by the Copyright Holder that are necessarily
|
187
|
+
infringed by the Package. If you institute patent litigation
|
188
|
+
(including a cross-claim or counterclaim) against any party alleging
|
189
|
+
that the Package constitutes direct or contributory patent
|
190
|
+
infringement, then this Artistic License to you shall terminate on the
|
191
|
+
date that such litigation is filed.
|
192
|
+
|
193
|
+
(14) Disclaimer of Warranty:
|
194
|
+
THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS
|
195
|
+
IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
|
196
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
|
197
|
+
NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL
|
198
|
+
LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL
|
199
|
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
|
200
|
+
DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF
|
201
|
+
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
CHANGED
@@ -1,351 +1,33 @@
|
|
1
1
|
[](https://codeclimate.com/github/bio-miga/miga)
|
2
2
|
[](https://codeclimate.com/github/bio-miga/miga/coverage)
|
3
|
-
[](https://travis-ci.org/bio-miga/miga)
|
4
|
+
[](https://badge.fury.io/rb/miga-base)
|
5
|
+
[](http://inch-ci.org/github/bio-miga/miga)
|
6
|
+
[](http://www.rubydoc.info/github/bio-miga/miga)
|
4
7
|
|
5
|
-
MiGA: Microbial Genomes Atlas
|
6
|
-
=============================
|
7
8
|
|
9
|
+
# MiGA: Microbial Genomes Atlas
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
+
**Important**: The MiGA code is under active development, and we currently
|
12
|
+
cannot ensure any stability on the different interfaces. We'll be launching a
|
13
|
+
Beta Testing program soon, with dedicated support for a small number of
|
14
|
+
laboratories. If you're interested, please [contact us][contact].
|
11
15
|
|
12
|
-
|
16
|
+
For additional information on the MiGA system, please refer to the
|
17
|
+
[MiGA manual][gitbook]. For additional information on the MiGA API
|
18
|
+
(and Ruby gem), please refer to the [miga docs][rubydoc].
|
13
19
|
|
14
20
|
|
15
|
-
|
16
|
-
-------------------------
|
17
|
-
|
18
|
-
### MiGA Interfaces
|
19
|
-
|
20
|
-
You caninteract with MiGA through different interfaces. These interfaces have
|
21
|
-
different purposes, but they also have some degree of overlap, because different
|
22
|
-
users with different aims sometimes want to do the same thing. Throughout this
|
23
|
-
manual I'll be telling you how to do things using mostly the CLI, but I'll also
|
24
|
-
try to mention the GUI and the Web Interface. The CLI is the most comprehensive
|
25
|
-
and flexible interface, but the other two are friendlier to humans. There is a
|
26
|
-
fourth interface that I won't be mentioning at all, but I'll try to document:
|
27
|
-
the Ruby API. MiGA is mostly written in Ruby, with an object-oriented approach,
|
28
|
-
and all the interfaces are just thin layers atop the Ruby core. That means that
|
29
|
-
you can write your own interfaces (or pieces) if you know how to talk to these
|
30
|
-
Ruby objects. Sometimes I even use `irb`, which is an interactive shell for
|
31
|
-
Ruby, but that's mostly for debugging.
|
32
|
-
|
33
|
-
#### MiGA CLI
|
34
|
-
|
35
|
-
CLI stands for Command Line Interface. This is a set of little scripts that let
|
36
|
-
you talk with MiGA through the terminal shell. If MiGA is in your PATH (see
|
37
|
-
[installation details](./INSTALLATION.md#miga-in-your-path)), you can simply run
|
38
|
-
`miga` in your terminal, and the help messages will take it from there. All the
|
39
|
-
MiGA CLI calls look like:
|
40
|
-
|
41
|
-
```bash
|
42
|
-
miga task [options]
|
43
|
-
```
|
44
|
-
|
45
|
-
Where `task` is one of the supported tasks and `[options]` is a set of dash-flag
|
46
|
-
options supported by each task. `-h` is always there to provide help. If you're
|
47
|
-
a MiGA administrator, this is probably the most convenient option for you (but
|
48
|
-
hey, give the GUI a chance).
|
49
|
-
|
50
|
-
#### MiGA GUI
|
51
|
-
|
52
|
-
The Graphical User Interface is the friendlier option for setting up a MiGA
|
53
|
-
project. It doesn't have as many options as the CLI, but it's pretty easy to
|
54
|
-
use, so it's a good option if you have a typical project in your hands.
|
55
|
-
|
56
|
-
#### MiGA Web
|
57
|
-
|
58
|
-
The Web interface for MiGA is the way MiGA reports results from a project. It's
|
59
|
-
not designed to set up new projects, but to explore existing ones, and to submit
|
60
|
-
non-reference datasets for analyses.
|
61
|
-
|
62
|
-
### Creating your first project
|
63
|
-
|
64
|
-
You can do this in the GUI, but I like the CLI better, so I'll be telling you
|
65
|
-
how to tell MiGA what to do from the CLI. First, think where you'll place your
|
66
|
-
project. Normally this means a location...
|
67
|
-
|
68
|
-
1. ... with enough space. This is, plan for at least 4 or 5 times the size of
|
69
|
-
the input files.
|
70
|
-
|
71
|
-
2. ... accessible by worker nodes. If you're using a single server, this is not
|
72
|
-
really an issue. However, if you plan on deploying MiGA in a cluster
|
73
|
-
infrastructure, make sure your project is reachable by worker nodes.
|
74
|
-
|
75
|
-
3. ... with fast access. It's not a great idea to set up projects in remote
|
76
|
-
drives with large latency. In some cases there is no way around this, for example
|
77
|
-
when that's the only available option in your cluster infrastructure, but try
|
78
|
-
to avoid this as much as possible.
|
79
|
-
|
80
|
-
Now that you know where to create your project, go ahead and run:
|
81
|
-
|
82
|
-
```bash
|
83
|
-
miga create_project -P /path/to/project1 -t type-of-project
|
84
|
-
```
|
85
|
-
|
86
|
-
Where `/path/to/project1` is the path to where the project should be created.
|
87
|
-
You don't need to create the folder in advance, MiGA will take care. See the
|
88
|
-
next section to help you decide what `type-of-project` to use. There are some
|
89
|
-
other options that are not mandatory, but will make your project richer. Take a
|
90
|
-
look at `miga create_project -h`.
|
91
|
-
|
92
|
-
#### Project types
|
93
|
-
|
94
|
-
Projects can be set for different purposes, so we've divided them into "types".
|
95
|
-
There are four of them, depending on the types of datasets to be processed (see
|
96
|
-
[Dataset types](#dataset-types)):
|
97
|
-
|
98
|
-
1. **mixed**: A generic project with any supported type of datasets.
|
99
|
-
|
100
|
-
2. **metagenomes**: A project containing only metagenomic datasets. This
|
101
|
-
includes either (or both) metagenomes and viromes.
|
102
|
-
|
103
|
-
3. **genomes**: A project containing only single-organism datasets. This
|
104
|
-
includes any of the single-organism types: genome, scgenome, and/or popgenome.
|
105
|
-
|
106
|
-
4. **clade**: Same as "genomes", but all the datasets are expected to be from
|
107
|
-
the same species. This type of project performs additional analyses that expect
|
108
|
-
a very dense ANI matrix, so all genomes in it are expected to have AAI > 90%.
|
109
|
-
|
110
|
-
### Creating datasets
|
111
|
-
|
112
|
-
Once your project is ready, you can start populating it with datasets and data.
|
113
|
-
While it's possible to create empty datasets using `miga create_dataset`, the
|
114
|
-
preferred method is to first add data and then use the data to create the
|
115
|
-
datasets in batch. For example, let's assume you have a collection of paired-end
|
116
|
-
raw reads from several datasets. The first step is to format the filenames
|
117
|
-
properly. For each one of your datasets, pick a name that conforms the
|
118
|
-
[MiGA names](#miga-names) restrictions (we'll call it "ds1") and rename your
|
119
|
-
reads to `/path/to/project1/data/01.raw_reads/ds1.1.fastq` for the first
|
120
|
-
sister and `/path/to/project1/data/01.raw_reads/ds1.2.fastq` for the second
|
121
|
-
sister. Also, add the date into `/path/to/project1/data/01.raw_reads/ds1.done`.
|
122
|
-
Check what are the [expected result files](#expected-result-files) below if you
|
123
|
-
want to start at any other point in the pipeline. Once you have renamed (or
|
124
|
-
copied) the files inside the project folder, run:
|
125
|
-
|
126
|
-
```bash
|
127
|
-
miga find_datasets -P /path/to/project1 -a -r -t type-of-dataset
|
128
|
-
```
|
129
|
-
|
130
|
-
The `-a` flag tells MiGA that you want to add the datasets (not just find them);
|
131
|
-
the `-r` flag tells MiGA that your datasets are to be treated as "reference"
|
132
|
-
datasets (see [Non-reference datasets](#non-reference-datasets) below); and the
|
133
|
-
`-t` option tells MiGA what type of datasets you're adding (see
|
134
|
-
[Dataset types](#dataset-types) below). If you have a mixture of dataset types,
|
135
|
-
process one at a time. This is, perform this step for each dataset type. Don't
|
136
|
-
worry about the datasets that are already registered, those will be ignored by
|
137
|
-
the `find_datasets` task and will remain unchanged.
|
138
|
-
|
139
|
-
#### Expected result files
|
140
|
-
|
141
|
-
For brevity, we'll assume that you're inside `/path/to/project1/data`; *i.e.*,
|
142
|
-
in the `data` directory of your project. We'll also assume that you're naming
|
143
|
-
your dataset **ds1**, but you can change this by anything following the
|
144
|
-
[MiGA names](#miga-names) restrictions. Now, these are the "input" points that
|
145
|
-
you can use in MiGA:
|
146
|
-
|
147
|
-
1. **Paired-end raw reads**: The expected files are `01.raw_reads/ds1.1.fastq`
|
148
|
-
and `01.raw_reads/ds1.2.fastq`, each including a sister end. The reads must be
|
149
|
-
in the same order in both files (MiGA won't check). You can also use gzipped
|
150
|
-
files instead.
|
151
|
-
|
152
|
-
2. **Single-end raw reads**: The expected file is `01.raw_reads/ds1.1.fastq`.
|
153
|
-
You can also use a gzipped file instead.
|
154
|
-
|
155
|
-
3. **Paired-end trimmed reads**: These are assumed to be quality-controlled
|
156
|
-
reads in FastA format, with both ends passing the quality filters. The minimum
|
157
|
-
expected file is `04.trimmed_fasta/ds1.CoupledReads.fa`, which contains the
|
158
|
-
reads interposed. You can also pass (in addition) the reads that passed the
|
159
|
-
quality check without the sister as a gzipped FastA at
|
160
|
-
`04.trimmed_fasta/ds1.SingleReads.fa.gz`.
|
161
|
-
|
162
|
-
4. **Single-end trimmed reads**: Similar to the option above, only
|
163
|
-
quality-checked reads are expected here. The expected file is
|
164
|
-
`04.trimmed_fasta/ds1.SingleReads.fa`.
|
165
|
-
|
166
|
-
5. **Assembled fragments**: This can be any assembly result, including complete
|
167
|
-
genomes. The expected file is `05.assembly/ds1.LargeContigs.fna`, containing
|
168
|
-
only contigs longer than 500bp. You can also provide the complete assembly
|
169
|
-
(without length-filtering) at `05.assembly/ds1.AllContigs.fna`.
|
170
|
-
|
171
|
-
6. **Predicted genes/proteins**: This is the total collection of predicted genes
|
172
|
-
and proteins. The expected files are `06.cds/ds1.fna`, containing genes, and
|
173
|
-
`06.cds/ds1.faa`, containing proteins. You can also provide the locations of
|
174
|
-
said genes in the genome in gzipped GFF v2 (`06.cds/ds1.gff2.gz`), gzipped
|
175
|
-
GFF v3 (`06.cds/ds1.gff3.gz`), or gzipped tabular (`06.cds/ds1.tab.gz`).
|
176
|
-
|
177
|
-
**IMPORTANT**: In all cases, an additional `ds1.done` file MUST be created in
|
178
|
-
the same folder. This is meant to prevent MiGA from mistakenly adding files as
|
179
|
-
results before they're done being processed or transferred. This file must
|
180
|
-
contain the current [date in MiGA format](#date-in-miga-format). Here's a quick
|
181
|
-
code snippet to add the `.done` file for all the input files in `01.raw_reads`
|
182
|
-
(you can adapt this accordingly to any of the other options):
|
183
|
-
|
184
|
-
```bash
|
185
|
-
cd /path/to/project1/data/01.raw_reads
|
186
|
-
for i in *.1.fastq ; do
|
187
|
-
date "+%Y-%m-%d %H:%M:%S %z" > $(basename $i .1.fastq).done
|
188
|
-
done
|
189
|
-
```
|
190
|
-
|
191
|
-
#### Dataset types
|
192
|
-
|
193
|
-
This is how you tell MiGA what kind of data you have in your datasets. Let's see
|
194
|
-
the definitions:
|
195
|
-
|
196
|
-
1. **genome**: The genome from an isolate.
|
197
|
-
2. **metagenome**: A metagenome (excluding viromes).
|
198
|
-
3. **virome**: A viral metagenome.
|
199
|
-
4. **scgenome**: A genome from a single cell.
|
200
|
-
5. **popgenome**: The genome of a population (including microdiversity).
|
201
|
-
|
202
|
-
#### Non-reference datasets
|
203
|
-
|
204
|
-
|
205
|
-
#### Creating a RefSeq project
|
206
|
-
|
207
|
-
If you've reached this point, you are now ready to create a large functional
|
208
|
-
project. If you want to continue using this documentation on real data but
|
209
|
-
don't have any of your own handy (or if you want to use RefSeq data), this
|
210
|
-
is a quick tutorial on how to create a functional MiGA project using ALL of
|
211
|
-
NCBI's Prokaryotic RefSeq data.
|
212
|
-
|
213
|
-
**Step 1: Create the project**. That's simple, just `cd` to the directory you
|
214
|
-
want to use, and execute `miga create_project -P MiGA_RefSeq -t genomes`.
|
215
|
-
|
216
|
-
**Step 2: Download the data**. Just `cd MiGA_RefSeq`, and execute this code:
|
217
|
-
|
218
|
-
```bash
|
219
|
-
wget -O reference_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on&type=reference'
|
220
|
-
grep -v '^#' reference_genomes.txt \
|
221
|
-
| awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} {print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
|
222
|
-
| while read ln ; do
|
223
|
-
sp=$(echo $ln | perl -pe 's/.*# //')
|
224
|
-
if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
|
225
|
-
echo $ln
|
226
|
-
$ln
|
227
|
-
fi
|
228
|
-
done
|
229
|
-
```
|
230
|
-
|
231
|
-
And that's it. The first line will download the most current list of genomes
|
232
|
-
included in NCBI's Prokaryotic RefSeq, and the rest will repeatedly execute the
|
233
|
-
`download_dataset` task, that automatically fetches the data (even the genome's
|
234
|
-
taxonomy!). Note that the code above checks first if a dataset already exists,
|
235
|
-
so if you want to update an existing MiGA_RefSeq project, simply repeat step 2
|
236
|
-
and only missing genomes will be fetched.
|
237
|
-
|
238
|
-
Note that running time for the above code may vary depending on the network and
|
239
|
-
the size of RefSeq, but I was able to create a complete project with 122 genomes
|
240
|
-
in under 10 minutes.
|
241
|
-
|
242
|
-
**Alternative step 2: downloading all representatives**. If you want a larger
|
243
|
-
and more comprehensive collection, and not just the reference genomes, you can
|
244
|
-
download all of the representative genomes in the prokaryotic RefSeq with this
|
245
|
-
alternative code:
|
246
|
-
|
247
|
-
```bash
|
248
|
-
wget -O representative_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on'
|
249
|
-
grep -v '^#' representative_genomes.txt \
|
250
|
-
| awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} $4{print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
|
251
|
-
| while read ln ; do
|
252
|
-
sp=$(echo $ln | perl -pe 's/.*# //')
|
253
|
-
if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
|
254
|
-
echo $ln
|
255
|
-
$ln
|
256
|
-
fi
|
257
|
-
done
|
258
|
-
```
|
259
|
-
|
260
|
-
This is a much larger set (1,246), hence it'll take much more time. I finished
|
261
|
-
downloading the whole thing in about one and a half hours.
|
262
|
-
|
263
|
-
|
264
|
-
Launching daemons
|
265
|
-
-----------------
|
266
|
-
|
267
|
-
### Configuring daemons
|
268
|
-
|
269
|
-
|
270
|
-
### Understanding the MiGA configuration file
|
271
|
-
|
272
|
-
|
273
|
-
### Arbitrary configuration scripts
|
274
|
-
|
275
|
-
|
276
|
-
### Fixing system calls with aliases
|
277
|
-
|
278
|
-
In some cases, we might not have the same executable names as MiGA expects, or
|
279
|
-
we might have broken modules in our cluster that can be easily fixed with an
|
280
|
-
`alias`. In these cases, you can use
|
281
|
-
[arbitrary configuration scripts](#arbitrary-configuration-scripts) to generate
|
282
|
-
one or more `alias`. Importantly, MiGA daemons work with non-interactive shells,
|
283
|
-
which means you likely need to explicitly allow for alias expansions, for
|
284
|
-
example:
|
285
|
-
|
286
|
-
```bash
|
287
|
-
# Allow alias expansions in non-interactive shells
|
288
|
-
shopt -s expand_aliases
|
289
|
-
|
290
|
-
# Call FastQC with the environmental Perl,
|
291
|
-
# not the built-in /usr/bin/perl:
|
292
|
-
alias fastqc="perl $(which fastqc)"
|
293
|
-
|
294
|
-
# Use the standard name for RAxML (pthreads)
|
295
|
-
# instead of the one my sys-admin decided to use:
|
296
|
-
alias raxmlHPC-PTHREADS=RAxML_pthreads
|
297
|
-
```
|
298
|
-
|
299
|
-
The examples above illustrate how to use `alias` to fix broken packages or to
|
300
|
-
make Software with non-standard names reachable.
|
301
|
-
|
302
|
-
**Known caveats to this solution:** This solution CANNOT BE USED in the few
|
303
|
-
cases in which a whole package is expected based on a single executable. For
|
304
|
-
example, adding the enveomics scripts to your `PATH` is far easier than creating
|
305
|
-
an `alias` for each script. Also, MiGA expects to find the model, the activation
|
306
|
-
key, and the scripts of MetaGeneMark in the same folder of the `gmhmmp` binary,
|
307
|
-
so setting an `alias` may prevent MiGA from finding these ancillary files.
|
308
|
-
|
309
|
-
|
310
|
-
Cluster infrastructure
|
311
|
-
----------------------
|
312
|
-
|
313
|
-
|
314
|
-
### Loading optional modules
|
315
|
-
|
316
|
-
|
317
|
-
See also [Fixing system calls with aliases](#fixing-system-calls-with-aliases).
|
318
|
-
|
319
|
-
|
320
|
-
Miscellaneous
|
321
|
-
-------------
|
322
|
-
|
323
|
-
Below are reference snippets for which I couldn't find a more
|
324
|
-
suitable home, but are important documentation.
|
325
|
-
|
326
|
-
### MiGA Names
|
327
|
-
|
328
|
-
MiGA names are non-empty strings composed exclusively of alphanumerics and
|
329
|
-
underscores. All the dataset names in MiGA must conform this restriction, but
|
330
|
-
not all the projects do. Other objects must conform the MiGA name restrictions,
|
331
|
-
such as taxonomic entries.
|
332
|
-
|
333
|
-
### Date in MiGA format
|
334
|
-
|
335
|
-
The official format in which MiGA represents date/times is the default of Ruby's
|
336
|
-
`Time.now.to_s`. In the *nix `date` utility this corresponds to the format:
|
337
|
-
`+%Y-%m-%d %H:%M:%S %z`.
|
338
|
-
|
339
|
-
|
340
|
-
Authors
|
341
|
-
-------
|
21
|
+
# Authors
|
342
22
|
|
343
23
|
Developed and maintained by [Luis M. Rodriguez-R][lrr].
|
344
24
|
|
345
25
|
|
346
|
-
License
|
347
|
-
-------
|
26
|
+
# License
|
348
27
|
|
349
28
|
See [LICENSE](LICENSE).
|
350
29
|
|
351
30
|
[lrr]: http://lmrodriguezr.github.io/
|
31
|
+
[gitbook]: https://miga.gitbooks.io/miga/content/
|
32
|
+
[rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
|
33
|
+
[contact]: http://enve-omics.gatech.edu/node/7
|
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require "rake/testtask"
|
2
|
+
|
3
|
+
SOURCES = FileList["lib/**/*.rb"]
|
4
|
+
|
5
|
+
desc "Default Task"
|
6
|
+
task :default => "test:base"
|
7
|
+
|
8
|
+
desc "Base Tests"
|
9
|
+
Rake::TestTask.new("test:base") do |t|
|
10
|
+
t.libs << "test"
|
11
|
+
t.pattern = "test/[^j]*_test.rb"
|
12
|
+
t.verbose = true
|
13
|
+
end
|
14
|
+
|
15
|
+
desc "GUI Tests"
|
16
|
+
Rake::TestTask.new("test:gui") do |t|
|
17
|
+
ENV["GUI_TESTS"] = "true"
|
18
|
+
t.libs << "test"
|
19
|
+
t.libs << "test"
|
20
|
+
t.pattern = "test/j*_test.rb"
|
21
|
+
t.verbose = true
|
22
|
+
end
|
23
|
+
|
24
|
+
desc "All the tests"
|
25
|
+
Rake::TestTask.new("test:all") do |t|
|
26
|
+
ENV["GUI_TESTS"] = "true"
|
27
|
+
t.libs << "test"
|
28
|
+
t.libs << "test"
|
29
|
+
t.pattern = "test/*_test.rb"
|
30
|
+
t.verbose = true
|
31
|
+
end
|
data/actions/add_result
CHANGED
data/actions/add_taxonomy
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# @package MiGA
|
4
|
-
# @
|
5
|
-
# @license artistic license 2.0
|
6
|
-
# @update Oct-01-2015
|
7
|
-
#
|
4
|
+
# @license Artistic-2.0
|
8
5
|
|
9
6
|
o = {q:true}
|
10
7
|
OptionParser.new do |opt|
|
@@ -57,9 +54,9 @@ if not o[:taxfile].nil?
|
|
57
54
|
$stderr.puts "Reading tax-file and registering taxonomy." unless o[:q]
|
58
55
|
tfh = File.open(o[:taxfile], "r")
|
59
56
|
header = nil
|
60
|
-
|
57
|
+
tfh.each_line do |ln|
|
61
58
|
next if ln =~ /^\s*?$/
|
62
|
-
r = ln.chomp.split
|
59
|
+
r = ln.chomp.split(/\t/, -1)
|
63
60
|
dn = r.shift
|
64
61
|
if header.nil?
|
65
62
|
header = r
|
data/actions/create_dataset
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# @package MiGA
|
4
|
-
# @
|
5
|
-
# @license artistic license 2.0
|
6
|
-
# @update Nov-29-2015
|
7
|
-
#
|
4
|
+
# @license Artistic-2.0
|
8
5
|
|
9
6
|
o = {q:true, ref:true}
|
10
7
|
OptionParser.new do |opt|
|
@@ -55,8 +52,10 @@ raise "Impossible to load project: #{o[:project]}" if p.nil?
|
|
55
52
|
$stderr.puts "Creating dataset." unless o[:q]
|
56
53
|
md = {}
|
57
54
|
[:type, :description, :user, :comments].each{ |k| md[k]=o[k] unless o[k].nil? }
|
58
|
-
|
55
|
+
MiGA::Dataset.new(p, o[:dataset], o[:ref], md)
|
59
56
|
p.add_dataset(o[:dataset])
|
57
|
+
res = d.first_preprocessing
|
58
|
+
put "- #{res}" unless o[:q]
|
60
59
|
|
61
60
|
$stderr.puts "Done." unless o[:q]
|
62
61
|
|