miga-base 0.2.0.6 → 0.2.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE +201 -0
- data/README.md +17 -335
- data/Rakefile +31 -0
- data/actions/add_result +2 -5
- data/actions/add_taxonomy +4 -7
- data/actions/create_dataset +5 -6
- data/actions/create_project +2 -5
- data/actions/daemon +2 -5
- data/actions/download_dataset +88 -58
- data/actions/find_datasets +36 -38
- data/actions/import_datasets +2 -5
- data/actions/index_taxonomy +2 -5
- data/actions/list_datasets +47 -49
- data/actions/list_files +7 -11
- data/actions/unlink_dataset +2 -5
- data/bin/miga +1 -1
- data/lib/miga/common.rb +132 -0
- data/lib/miga/daemon.rb +229 -168
- data/lib/miga/dataset.rb +354 -277
- data/lib/miga/gui.rb +346 -269
- data/lib/miga/metadata.rb +115 -71
- data/lib/miga/project.rb +361 -259
- data/lib/miga/remote_dataset.rb +200 -148
- data/lib/miga/result.rb +150 -99
- data/lib/miga/tax_index.rb +124 -67
- data/lib/miga/taxonomy.rb +129 -100
- data/lib/miga/version.rb +57 -0
- data/lib/miga.rb +2 -77
- data/scripts/_distances_noref_nomulti.bash +2 -0
- data/scripts/_distances_ref_nomulti.bash +2 -0
- data/scripts/aai_distances.bash +1 -0
- data/scripts/ani_distances.bash +1 -0
- data/scripts/assembly.bash +1 -0
- data/scripts/cds.bash +1 -0
- data/scripts/clade_finding.bash +17 -1
- data/scripts/distances.bash +1 -0
- data/scripts/essential_genes.bash +1 -0
- data/scripts/haai_distances.bash +1 -0
- data/scripts/init.bash +2 -0
- data/scripts/mytaxa.bash +1 -0
- data/scripts/mytaxa_scan.bash +1 -0
- data/scripts/ogs.bash +1 -0
- data/scripts/read_quality.bash +1 -0
- data/scripts/ssu.bash +1 -0
- data/scripts/subclades.bash +1 -0
- data/scripts/trimmed_fasta.bash +1 -0
- data/scripts/trimmed_reads.bash +1 -0
- data/test/common_test.rb +82 -0
- data/test/daemon_test.rb +53 -0
- data/test/dataset_test.rb +156 -0
- data/test/jruby_gui_test.rb +20 -0
- data/test/metadata_test.rb +48 -0
- data/test/project_test.rb +54 -0
- data/test/remote_dataset_test.rb +41 -0
- data/test/tax_index_test.rb +44 -0
- data/test/taxonomy_test.rb +36 -0
- data/test/test_helper.rb +32 -0
- metadata +53 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 01b3728971a5d407f85578447d4a66dc4c8ab8a8
|
4
|
+
data.tar.gz: 656535155e0316681f2d7aa9bcc8b501caed9d96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5daf2a27f6a6119e18e5eda94dbda72be91ad7b46f4f8bea401f111ce95d907c88930b08d85543c1694a82aeac54c42dff5d0c3586c0846123d1e1881ad23885
|
7
|
+
data.tar.gz: 0a75b0152ad7374729c1cb09b02b539b70429902850c81cdb086f64e9b616cc66f5ba032469be85505c103dc05d15b9469b2c26689b8b4dee2b0c3ef6636ad61
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
The Artistic License 2.0
|
2
|
+
|
3
|
+
Copyright (c) 2016 Luis M Rodriguez-R
|
4
|
+
|
5
|
+
Everyone is permitted to copy and distribute verbatim copies
|
6
|
+
of this license document, but changing it is not allowed.
|
7
|
+
|
8
|
+
Preamble
|
9
|
+
|
10
|
+
This license establishes the terms under which a given free software
|
11
|
+
Package may be copied, modified, distributed, and/or redistributed.
|
12
|
+
The intent is that the Copyright Holder maintains some artistic
|
13
|
+
control over the development of that Package while still keeping the
|
14
|
+
Package available as open source and free software.
|
15
|
+
|
16
|
+
You are always permitted to make arrangements wholly outside of this
|
17
|
+
license directly with the Copyright Holder of a given Package. If the
|
18
|
+
terms of this license do not permit the full use that you propose to
|
19
|
+
make of the Package, you should contact the Copyright Holder and seek
|
20
|
+
a different licensing arrangement.
|
21
|
+
|
22
|
+
Definitions
|
23
|
+
|
24
|
+
"Copyright Holder" means the individual(s) or organization(s)
|
25
|
+
named in the copyright notice for the entire Package.
|
26
|
+
|
27
|
+
"Contributor" means any party that has contributed code or other
|
28
|
+
material to the Package, in accordance with the Copyright Holder's
|
29
|
+
procedures.
|
30
|
+
|
31
|
+
"You" and "your" means any person who would like to copy,
|
32
|
+
distribute, or modify the Package.
|
33
|
+
|
34
|
+
"Package" means the collection of files distributed by the
|
35
|
+
Copyright Holder, and derivatives of that collection and/or of
|
36
|
+
those files. A given Package may consist of either the Standard
|
37
|
+
Version, or a Modified Version.
|
38
|
+
|
39
|
+
"Distribute" means providing a copy of the Package or making it
|
40
|
+
accessible to anyone else, or in the case of a company or
|
41
|
+
organization, to others outside of your company or organization.
|
42
|
+
|
43
|
+
"Distributor Fee" means any fee that you charge for Distributing
|
44
|
+
this Package or providing support for this Package to another
|
45
|
+
party. It does not mean licensing fees.
|
46
|
+
|
47
|
+
"Standard Version" refers to the Package if it has not been
|
48
|
+
modified, or has been modified only in ways explicitly requested
|
49
|
+
by the Copyright Holder.
|
50
|
+
|
51
|
+
"Modified Version" means the Package, if it has been changed, and
|
52
|
+
such changes were not explicitly requested by the Copyright
|
53
|
+
Holder.
|
54
|
+
|
55
|
+
"Original License" means this Artistic License as Distributed with
|
56
|
+
the Standard Version of the Package, in its current version or as
|
57
|
+
it may be modified by The Perl Foundation in the future.
|
58
|
+
|
59
|
+
"Source" form means the source code, documentation source, and
|
60
|
+
configuration files for the Package.
|
61
|
+
|
62
|
+
"Compiled" form means the compiled bytecode, object code, binary,
|
63
|
+
or any other form resulting from mechanical transformation or
|
64
|
+
translation of the Source form.
|
65
|
+
|
66
|
+
|
67
|
+
Permission for Use and Modification Without Distribution
|
68
|
+
|
69
|
+
(1) You are permitted to use the Standard Version and create and use
|
70
|
+
Modified Versions for any purpose without restriction, provided that
|
71
|
+
you do not Distribute the Modified Version.
|
72
|
+
|
73
|
+
|
74
|
+
Permissions for Redistribution of the Standard Version
|
75
|
+
|
76
|
+
(2) You may Distribute verbatim copies of the Source form of the
|
77
|
+
Standard Version of this Package in any medium without restriction,
|
78
|
+
either gratis or for a Distributor Fee, provided that you duplicate
|
79
|
+
all of the original copyright notices and associated disclaimers. At
|
80
|
+
your discretion, such verbatim copies may or may not include a
|
81
|
+
Compiled form of the Package.
|
82
|
+
|
83
|
+
(3) You may apply any bug fixes, portability changes, and other
|
84
|
+
modifications made available from the Copyright Holder. The resulting
|
85
|
+
Package will still be considered the Standard Version, and as such
|
86
|
+
will be subject to the Original License.
|
87
|
+
|
88
|
+
|
89
|
+
Distribution of Modified Versions of the Package as Source
|
90
|
+
|
91
|
+
(4) You may Distribute your Modified Version as Source (either gratis
|
92
|
+
or for a Distributor Fee, and with or without a Compiled form of the
|
93
|
+
Modified Version) provided that you clearly document how it differs
|
94
|
+
from the Standard Version, including, but not limited to, documenting
|
95
|
+
any non-standard features, executables, or modules, and provided that
|
96
|
+
you do at least ONE of the following:
|
97
|
+
|
98
|
+
(a) make the Modified Version available to the Copyright Holder
|
99
|
+
of the Standard Version, under the Original License, so that the
|
100
|
+
Copyright Holder may include your modifications in the Standard
|
101
|
+
Version.
|
102
|
+
|
103
|
+
(b) ensure that installation of your Modified Version does not
|
104
|
+
prevent the user installing or running the Standard Version. In
|
105
|
+
addition, the Modified Version must bear a name that is different
|
106
|
+
from the name of the Standard Version.
|
107
|
+
|
108
|
+
(c) allow anyone who receives a copy of the Modified Version to
|
109
|
+
make the Source form of the Modified Version available to others
|
110
|
+
under
|
111
|
+
|
112
|
+
(i) the Original License or
|
113
|
+
|
114
|
+
(ii) a license that permits the licensee to freely copy,
|
115
|
+
modify and redistribute the Modified Version using the same
|
116
|
+
licensing terms that apply to the copy that the licensee
|
117
|
+
received, and requires that the Source form of the Modified
|
118
|
+
Version, and of any works derived from it, be made freely
|
119
|
+
available in that license fees are prohibited but Distributor
|
120
|
+
Fees are allowed.
|
121
|
+
|
122
|
+
|
123
|
+
Distribution of Compiled Forms of the Standard Version
|
124
|
+
or Modified Versions without the Source
|
125
|
+
|
126
|
+
(5) You may Distribute Compiled forms of the Standard Version without
|
127
|
+
the Source, provided that you include complete instructions on how to
|
128
|
+
get the Source of the Standard Version. Such instructions must be
|
129
|
+
valid at the time of your distribution. If these instructions, at any
|
130
|
+
time while you are carrying out such distribution, become invalid, you
|
131
|
+
must provide new instructions on demand or cease further distribution.
|
132
|
+
If you provide valid instructions or cease distribution within thirty
|
133
|
+
days after you become aware that the instructions are invalid, then
|
134
|
+
you do not forfeit any of your rights under this license.
|
135
|
+
|
136
|
+
(6) You may Distribute a Modified Version in Compiled form without
|
137
|
+
the Source, provided that you comply with Section 4 with respect to
|
138
|
+
the Source of the Modified Version.
|
139
|
+
|
140
|
+
|
141
|
+
Aggregating or Linking the Package
|
142
|
+
|
143
|
+
(7) You may aggregate the Package (either the Standard Version or
|
144
|
+
Modified Version) with other packages and Distribute the resulting
|
145
|
+
aggregation provided that you do not charge a licensing fee for the
|
146
|
+
Package. Distributor Fees are permitted, and licensing fees for other
|
147
|
+
components in the aggregation are permitted. The terms of this license
|
148
|
+
apply to the use and Distribution of the Standard or Modified Versions
|
149
|
+
as included in the aggregation.
|
150
|
+
|
151
|
+
(8) You are permitted to link Modified and Standard Versions with
|
152
|
+
other works, to embed the Package in a larger work of your own, or to
|
153
|
+
build stand-alone binary or bytecode versions of applications that
|
154
|
+
include the Package, and Distribute the result without restriction,
|
155
|
+
provided the result does not expose a direct interface to the Package.
|
156
|
+
|
157
|
+
|
158
|
+
Items That are Not Considered Part of a Modified Version
|
159
|
+
|
160
|
+
(9) Works (including, but not limited to, modules and scripts) that
|
161
|
+
merely extend or make use of the Package, do not, by themselves, cause
|
162
|
+
the Package to be a Modified Version. In addition, such works are not
|
163
|
+
considered parts of the Package itself, and are not subject to the
|
164
|
+
terms of this license.
|
165
|
+
|
166
|
+
|
167
|
+
General Provisions
|
168
|
+
|
169
|
+
(10) Any use, modification, and distribution of the Standard or
|
170
|
+
Modified Versions is governed by this Artistic License. By using,
|
171
|
+
modifying or distributing the Package, you accept this license. Do not
|
172
|
+
use, modify, or distribute the Package, if you do not accept this
|
173
|
+
license.
|
174
|
+
|
175
|
+
(11) If your Modified Version has been derived from a Modified
|
176
|
+
Version made by someone other than you, you are nevertheless required
|
177
|
+
to ensure that your Modified Version complies with the requirements of
|
178
|
+
this license.
|
179
|
+
|
180
|
+
(12) This license does not grant you the right to use any trademark,
|
181
|
+
service mark, tradename, or logo of the Copyright Holder.
|
182
|
+
|
183
|
+
(13) This license includes the non-exclusive, worldwide,
|
184
|
+
free-of-charge patent license to make, have made, use, offer to sell,
|
185
|
+
sell, import and otherwise transfer the Package with respect to any
|
186
|
+
patent claims licensable by the Copyright Holder that are necessarily
|
187
|
+
infringed by the Package. If you institute patent litigation
|
188
|
+
(including a cross-claim or counterclaim) against any party alleging
|
189
|
+
that the Package constitutes direct or contributory patent
|
190
|
+
infringement, then this Artistic License to you shall terminate on the
|
191
|
+
date that such litigation is filed.
|
192
|
+
|
193
|
+
(14) Disclaimer of Warranty:
|
194
|
+
THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS
|
195
|
+
IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
|
196
|
+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
|
197
|
+
NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL
|
198
|
+
LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL
|
199
|
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
|
200
|
+
DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF
|
201
|
+
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
CHANGED
@@ -1,351 +1,33 @@
|
|
1
1
|
[![Code Climate](https://codeclimate.com/github/bio-miga/miga/badges/gpa.svg)](https://codeclimate.com/github/bio-miga/miga)
|
2
2
|
[![Test Coverage](https://codeclimate.com/github/bio-miga/miga/badges/coverage.svg)](https://codeclimate.com/github/bio-miga/miga/coverage)
|
3
|
-
[![Build Status](https://travis-ci.org/
|
3
|
+
[![Build Status](https://travis-ci.org/bio-miga/miga.svg?branch=master)](https://travis-ci.org/bio-miga/miga)
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/miga-base.svg)](https://badge.fury.io/rb/miga-base)
|
5
|
+
[![Inch docs](http://inch-ci.org/github/bio-miga/miga.svg)](http://inch-ci.org/github/bio-miga/miga)
|
6
|
+
[![Yard docs](http://img.shields.io/badge/yard-docs-blue.svg)](http://www.rubydoc.info/github/bio-miga/miga)
|
4
7
|
|
5
|
-
MiGA: Microbial Genomes Atlas
|
6
|
-
=============================
|
7
8
|
|
9
|
+
# MiGA: Microbial Genomes Atlas
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
+
**Important**: The MiGA code is under active development, and we currently
|
12
|
+
cannot ensure any stability on the different interfaces. We'll be launching a
|
13
|
+
Beta Testing program soon, with dedicated support for a small number of
|
14
|
+
laboratories. If you're interested, please [contact us][contact].
|
11
15
|
|
12
|
-
|
16
|
+
For additional information on the MiGA system, please refer to the
|
17
|
+
[MiGA manual][gitbook]. For additional information on the MiGA API
|
18
|
+
(and Ruby gem), please refer to the [miga docs][rubydoc].
|
13
19
|
|
14
20
|
|
15
|
-
|
16
|
-
-------------------------
|
17
|
-
|
18
|
-
### MiGA Interfaces
|
19
|
-
|
20
|
-
You can interact with MiGA through different interfaces. These interfaces have
|
21
|
-
different purposes, but they also have some degree of overlap, because different
|
22
|
-
users with different aims sometimes want to do the same thing. Throughout this
|
23
|
-
manual I'll be telling you how to do things using mostly the CLI, but I'll also
|
24
|
-
try to mention the GUI and the Web Interface. The CLI is the most comprehensive
|
25
|
-
and flexible interface, but the other two are friendlier to humans. There is a
|
26
|
-
fourth interface that I won't be mentioning at all, but I'll try to document:
|
27
|
-
the Ruby API. MiGA is mostly written in Ruby, with an object-oriented approach,
|
28
|
-
and all the interfaces are just thin layers atop the Ruby core. That means that
|
29
|
-
you can write your own interfaces (or pieces) if you know how to talk to these
|
30
|
-
Ruby objects. Sometimes I even use `irb`, which is an interactive shell for
|
31
|
-
Ruby, but that's mostly for debugging.
|
32
|
-
|
33
|
-
#### MiGA CLI
|
34
|
-
|
35
|
-
CLI stands for Command Line Interface. This is a set of little scripts that let
|
36
|
-
you talk with MiGA through the terminal shell. If MiGA is in your PATH (see
|
37
|
-
[installation details](./INSTALLATION.md#miga-in-your-path)), you can simply run
|
38
|
-
`miga` in your terminal, and the help messages will take it from there. All the
|
39
|
-
MiGA CLI calls look like:
|
40
|
-
|
41
|
-
```bash
|
42
|
-
miga task [options]
|
43
|
-
```
|
44
|
-
|
45
|
-
Where `task` is one of the supported tasks and `[options]` is a set of dash-flag
|
46
|
-
options supported by each task. `-h` is always there to provide help. If you're
|
47
|
-
a MiGA administrator, this is probably the most convenient option for you (but
|
48
|
-
hey, give the GUI a chance).
|
49
|
-
|
50
|
-
#### MiGA GUI
|
51
|
-
|
52
|
-
The Graphical User Interface is the friendlier option for setting up a MiGA
|
53
|
-
project. It doesn't have as many options as the CLI, but it's pretty easy to
|
54
|
-
use, so it's a good option if you have a typical project in your hands.
|
55
|
-
|
56
|
-
#### MiGA Web
|
57
|
-
|
58
|
-
The Web interface for MiGA is the way MiGA reports results from a project. It's
|
59
|
-
not designed to set up new projects, but to explore existing ones, and to submit
|
60
|
-
non-reference datasets for analyses.
|
61
|
-
|
62
|
-
### Creating your first project
|
63
|
-
|
64
|
-
You can do this in the GUI, but I like the CLI better, so I'll be telling you
|
65
|
-
how to tell MiGA what to do from the CLI. First, think where you'll place your
|
66
|
-
project. Normally this means a location...
|
67
|
-
|
68
|
-
1. ... with enough space. That is, plan for at least 4 or 5 times the size of
|
69
|
-
the input files.
|
70
|
-
|
71
|
-
2. ... accessible by worker nodes. If you're using a single server, this is not
|
72
|
-
really an issue. However, if you plan on deploying MiGA in a cluster
|
73
|
-
infrastructure, make sure your project is reachable by worker nodes.
|
74
|
-
|
75
|
-
3. ... with fast access. It's not a great idea to set up projects in remote
|
76
|
-
drives with large latency. In some cases there is no way around this, for example
|
77
|
-
when that's the only available option in your cluster infrastructure, but try
|
78
|
-
to avoid this as much as possible.
|
79
|
-
|
80
|
-
Now that you know where to create your project, go ahead and run:
|
81
|
-
|
82
|
-
```bash
|
83
|
-
miga create_project -P /path/to/project1 -t type-of-project
|
84
|
-
```
|
85
|
-
|
86
|
-
Where `/path/to/project1` is the path to where the project should be created.
|
87
|
-
You don't need to create the folder in advance, MiGA will take care. See the
|
88
|
-
next section to help you decide what `type-of-project` to use. There are some
|
89
|
-
other options that are not mandatory, but will make your project richer. Take a
|
90
|
-
look at `miga create_project -h`.
|
91
|
-
|
92
|
-
#### Project types
|
93
|
-
|
94
|
-
Projects can be set for different purposes, so we've divided them into "types".
|
95
|
-
There are four of them, depending on the types of datasets to be processed (see
|
96
|
-
[Dataset types](#dataset-types)):
|
97
|
-
|
98
|
-
1. **mixed**: A generic project with any supported type of datasets.
|
99
|
-
|
100
|
-
2. **metagenomes**: A project containing only metagenomic datasets. This
|
101
|
-
includes either (or both) metagenomes and viromes.
|
102
|
-
|
103
|
-
3. **genomes**: A project containing only single-organism datasets. This
|
104
|
-
includes any of the single-organism types: genome, scgenome, and/or popgenome.
|
105
|
-
|
106
|
-
4. **clade**: Same as "genomes", but all the datasets are expected to be from
|
107
|
-
the same species. This type of project performs additional analyses that expect
|
108
|
-
a very dense ANI matrix, so all genomes in it are expected to have AAI > 90%.
|
109
|
-
|
110
|
-
### Creating datasets
|
111
|
-
|
112
|
-
Once your project is ready, you can start populating it with datasets and data.
|
113
|
-
While it's possible to create empty datasets using `miga create_dataset`, the
|
114
|
-
preferred method is to first add data and then use the data to create the
|
115
|
-
datasets in batch. For example, let's assume you have a collection of paired-end
|
116
|
-
raw reads from several datasets. The first step is to format the filenames
|
117
|
-
properly. For each one of your datasets, pick a name that conforms to the
|
118
|
-
[MiGA names](#miga-names) restrictions (we'll call it "ds1") and rename your
|
119
|
-
reads to `/path/to/project1/data/01.raw_reads/ds1.1.fastq` for the first
|
120
|
-
sister and `/path/to/project1/data/01.raw_reads/ds1.2.fastq` for the second
|
121
|
-
sister. Also, add the date into `/path/to/project1/data/01.raw_reads/ds1.done`.
|
122
|
-
Check what are the [expected result files](#expected-result-files) below if you
|
123
|
-
want to start at any other point in the pipeline. Once you have renamed (or
|
124
|
-
copied) the files inside the project folder, run:
|
125
|
-
|
126
|
-
```bash
|
127
|
-
miga find_datasets -P /path/to/project1 -a -r -t type-of-dataset
|
128
|
-
```
|
129
|
-
|
130
|
-
The `-a` flag tells MiGA that you want to add the datasets (not just find them);
|
131
|
-
the `-r` flag tells MiGA that your datasets are to be treated as "reference"
|
132
|
-
datasets (see [Non-reference datasets](#non-reference-datasets) below); and the
|
133
|
-
`-t` option tells MiGA what type of datasets you're adding (see
|
134
|
-
[Dataset types](#dataset-types) below). If you have a mixture of dataset types,
|
135
|
-
process one at a time. That is, perform this step for each dataset type. Don't
|
136
|
-
worry about the datasets that are already registered, those will be ignored by
|
137
|
-
the `find_datasets` task and will remain unchanged.
|
138
|
-
|
139
|
-
#### Expected result files
|
140
|
-
|
141
|
-
For brevity, we'll assume that you're inside `/path/to/project1/data`; *i.e.*,
|
142
|
-
in the `data` directory of your project. We'll also assume that you're naming
|
143
|
-
your dataset **ds1**, but you can change this by anything following the
|
144
|
-
[MiGA names](#miga-names) restrictions. Now, these are the "input" points that
|
145
|
-
you can use in MiGA:
|
146
|
-
|
147
|
-
1. **Paired-end raw reads**: The expected files are `01.raw_reads/ds1.1.fastq`
|
148
|
-
and `01.raw_reads/ds1.2.fastq`, each including a sister end. The reads must be
|
149
|
-
in the same order in both files (MiGA won't check). You can also use gzipped
|
150
|
-
files instead.
|
151
|
-
|
152
|
-
2. **Single-end raw reads**: The expected file is `01.raw_reads/ds1.1.fastq`.
|
153
|
-
You can also use a gzipped file instead.
|
154
|
-
|
155
|
-
3. **Paired-end trimmed reads**: These are assumed to be quality-controlled
|
156
|
-
reads in FastA format, with both ends passing the quality filters. The minimum
|
157
|
-
expected file is `04.trimmed_fasta/ds1.CoupledReads.fa`, which contains the
|
158
|
-
reads interposed. You can also pass (in addition) the reads that passed the
|
159
|
-
quality check without the sister as a gzipped FastA at
|
160
|
-
`04.trimmed_fasta/ds1.SingleReads.fa.gz`.
|
161
|
-
|
162
|
-
4. **Single-end trimmed reads**: Similar to the option above, only
|
163
|
-
quality-checked reads are expected here. The expected file is
|
164
|
-
`04.trimmed_fasta/ds1.SingleReads.fa`.
|
165
|
-
|
166
|
-
5. **Assembled fragments**: This can be any assembly result, including complete
|
167
|
-
genomes. The expected file is `05.assembly/ds1.LargeContigs.fna`, containing
|
168
|
-
only contigs longer than 500bp. You can also provide the complete assembly
|
169
|
-
(without length-filtering) at `05.assembly/ds1.AllContigs.fna`.
|
170
|
-
|
171
|
-
6. **Predicted genes/proteins**: This is the total collection of predicted genes
|
172
|
-
and proteins. The expected files are `06.cds/ds1.fna`, containing genes, and
|
173
|
-
`06.cds/ds1.faa`, containing proteins. You can also provide the locations of
|
174
|
-
said genes in the genome in gzipped GFF v2 (`06.cds/ds1.gff2.gz`), gzipped
|
175
|
-
GFF v3 (`06.cds/ds1.gff3.gz`), or gzipped tabular (`06.cds/ds1.tab.gz`).
|
176
|
-
|
177
|
-
**IMPORTANT**: In all cases, an additional `ds1.done` file MUST be created in
|
178
|
-
the same folder. This is meant to prevent MiGA from mistakenly adding files as
|
179
|
-
results before they're done being processed or transferred. This file must
|
180
|
-
contain the current [date in MiGA format](#date-in-miga-format). Here's a quick
|
181
|
-
code snippet to add the `.done` file for all the input files in `01.raw_reads`
|
182
|
-
(you can adapt this accordingly to any of the other options):
|
183
|
-
|
184
|
-
```bash
|
185
|
-
cd /path/to/project1/data/01.raw_reads
|
186
|
-
for i in *.1.fastq ; do
|
187
|
-
date "+%Y-%m-%d %H:%M:%S %z" > $(basename $i .1.fastq).done
|
188
|
-
done
|
189
|
-
```
|
190
|
-
|
191
|
-
#### Dataset types
|
192
|
-
|
193
|
-
This is how you tell MiGA what kind of data you have in your datasets. Let's see
|
194
|
-
the definitions:
|
195
|
-
|
196
|
-
1. **genome**: The genome from an isolate.
|
197
|
-
2. **metagenome**: A metagenome (excluding viromes).
|
198
|
-
3. **virome**: A viral metagenome.
|
199
|
-
4. **scgenome**: A genome from a single cell.
|
200
|
-
5. **popgenome**: The genome of a population (including microdiversity).
|
201
|
-
|
202
|
-
#### Non-reference datasets
|
203
|
-
|
204
|
-
|
205
|
-
#### Creating a RefSeq project
|
206
|
-
|
207
|
-
If you've reached this point, you are now ready to create a large functional
|
208
|
-
project. If you want to continue using this documentation on real data but
|
209
|
-
don't have any of your own handy (or if you want to use RefSeq data), this
|
210
|
-
is a quick tutorial on how to create a functional MiGA project using ALL of
|
211
|
-
NCBI's Prokaryotic RefSeq data.
|
212
|
-
|
213
|
-
**Step 1: Create the project**. That's simple, just `cd` to the directory you
|
214
|
-
want to use, and execute `miga create_project -P MiGA_RefSeq -t genomes`.
|
215
|
-
|
216
|
-
**Step 2: Download the data**. Just `cd MiGA_RefSeq`, and execute this code:
|
217
|
-
|
218
|
-
```bash
|
219
|
-
wget -O reference_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on&type=reference'
|
220
|
-
grep -v '^#' reference_genomes.txt \
|
221
|
-
| awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} {print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
|
222
|
-
| while read ln ; do
|
223
|
-
sp=$(echo $ln | perl -pe 's/.*# //')
|
224
|
-
if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
|
225
|
-
echo $ln
|
226
|
-
$ln
|
227
|
-
fi
|
228
|
-
done
|
229
|
-
```
|
230
|
-
|
231
|
-
And that's it. The first line will download the most current list of genomes
|
232
|
-
included in NCBI's Prokaryotic RefSeq, and the rest will repeatedly execute the
|
233
|
-
`download_dataset` task, that automatically fetches the data (even the genome's
|
234
|
-
taxonomy!). Note that the code above checks first if a dataset already exists,
|
235
|
-
so if you want to update an existing MiGA_RefSeq project, simply repeat step 2
|
236
|
-
and only missing genomes will be fetched.
|
237
|
-
|
238
|
-
Note that running time for the above code may vary depending on the network and
|
239
|
-
the size of RefSeq, but I was able to create a complete project with 122 genomes
|
240
|
-
in under 10 minutes.
|
241
|
-
|
242
|
-
**Alternative step 2: downloading all representatives**. If you want a larger
|
243
|
-
and more comprehensive collection, and not just the reference genomes, you can
|
244
|
-
download all of the representative genomes in the prokaryotic RefSeq with this
|
245
|
-
alternative code:
|
246
|
-
|
247
|
-
```bash
|
248
|
-
wget -O representative_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on'
|
249
|
-
grep -v '^#' representative_genomes.txt \
|
250
|
-
| awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} $4{print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
|
251
|
-
| while read ln ; do
|
252
|
-
sp=$(echo $ln | perl -pe 's/.*# //')
|
253
|
-
if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
|
254
|
-
echo $ln
|
255
|
-
$ln
|
256
|
-
fi
|
257
|
-
done
|
258
|
-
```
|
259
|
-
|
260
|
-
This is a much larger set (1,246), hence it'll take much more time. I finished
|
261
|
-
downloading the whole thing in about one and a half hours.
|
262
|
-
|
263
|
-
|
264
|
-
Launching daemons
|
265
|
-
-----------------
|
266
|
-
|
267
|
-
### Configuring daemons
|
268
|
-
|
269
|
-
|
270
|
-
### Understanding the MiGA configuration file
|
271
|
-
|
272
|
-
|
273
|
-
### Arbitrary configuration scripts
|
274
|
-
|
275
|
-
|
276
|
-
### Fixing system calls with aliases
|
277
|
-
|
278
|
-
In some cases, we might not have the same executable names as MiGA expects, or
|
279
|
-
we might have broken modules in our cluster that can be easily fixed with an
|
280
|
-
`alias`. In these cases, you can use
|
281
|
-
[arbitrary configuration scripts](#arbitrary-configuration-scripts) to generate
|
282
|
-
one or more `alias`. Importantly, MiGA daemons work with non-interactive shells,
|
283
|
-
which means you likely need to explicitly allow for alias extensions, for
|
284
|
-
example:
|
285
|
-
|
286
|
-
```bash
|
287
|
-
# Allow alias expansions in non-interactive shells
|
288
|
-
shopt -s expand_aliases
|
289
|
-
|
290
|
-
# Call FastQC with the environmental Perl,
|
291
|
-
# not the built-in /usr/bin/perl:
|
292
|
-
alias fastqc="perl $(which fastqc)"
|
293
|
-
|
294
|
-
# Use the standard name for RAxML (pthreads)
|
295
|
-
# instead of the one my sys-admin decided to use:
|
296
|
-
alias raxmlHPC-PTHREADS=RAxML_pthreads
|
297
|
-
```
|
298
|
-
|
299
|
-
The examples above illustrate how to use `alias` to fix broken packages or to
|
300
|
-
make Software with non-standard names reachable.
|
301
|
-
|
302
|
-
**Known caveats to this solution:** This solution CANNOT BE USED in the few
|
303
|
-
cases in which a whole package is expected based on a single executable. For
|
304
|
-
example, adding the enveomics scripts to your `PATH` is far easier than creating
|
305
|
-
an `alias` for each script. Also, MiGA expects to find the model, the activation
|
306
|
-
key, and the scripts of MetaGeneMark in the same folder of the `gmhmmp` binary,
|
307
|
-
so setting an `alias` may prevent MiGA from finding these ancillary files.
|
308
|
-
|
309
|
-
|
310
|
-
Cluster infrastructure
|
311
|
-
----------------------
|
312
|
-
|
313
|
-
|
314
|
-
### Loading optional modules
|
315
|
-
|
316
|
-
|
317
|
-
See also [Fixing system calls with aliases](#fixing-system-calls-with-aliases).
|
318
|
-
|
319
|
-
|
320
|
-
Miscellaneous
|
321
|
-
-------------
|
322
|
-
|
323
|
-
Below are reference snippets for which I couldn't find a more
|
324
|
-
suitable home, but are important documentation.
|
325
|
-
|
326
|
-
### MiGA Names
|
327
|
-
|
328
|
-
MiGA names are non-empty strings composed exclusively of alphanumerics and
|
329
|
-
underscores. All the dataset names in MiGA must conform to this restriction, but
|
330
|
-
not all the projects do. Other objects must conform to the MiGA name restrictions,
|
331
|
-
such as taxonomic entries.
|
332
|
-
|
333
|
-
### Date in MiGA format
|
334
|
-
|
335
|
-
The official format in which MiGA represents date/times is the default of Ruby's
|
336
|
-
`Time.now.to_s`. In the *nix `date` utility this corresponds to the format:
|
337
|
-
`+%Y-%m-%d %H:%M:%S %z`.
|
338
|
-
|
339
|
-
|
340
|
-
Authors
|
341
|
-
-------
|
21
|
+
# Authors
|
342
22
|
|
343
23
|
Developed and maintained by [Luis M. Rodriguez-R][lrr].
|
344
24
|
|
345
25
|
|
346
|
-
License
|
347
|
-
-------
|
26
|
+
# License
|
348
27
|
|
349
28
|
See [LICENSE](LICENSE).
|
350
29
|
|
351
30
|
[lrr]: http://lmrodriguezr.github.io/
|
31
|
+
[gitbook]: https://miga.gitbooks.io/miga/content/
|
32
|
+
[rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
|
33
|
+
[contact]: http://enve-omics.gatech.edu/node/7
|
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require "rake/testtask"
|
2
|
+
|
3
|
+
SOURCES = FileList["lib/**/*.rb"]
|
4
|
+
|
5
|
+
desc "Default Task"
|
6
|
+
task :default => "test:base"
|
7
|
+
|
8
|
+
desc "Base Tests"
|
9
|
+
Rake::TestTask.new("test:base") do |t|
|
10
|
+
t.libs << "test"
|
11
|
+
t.pattern = "test/[^j]*_test.rb"
|
12
|
+
t.verbose = true
|
13
|
+
end
|
14
|
+
|
15
|
+
desc "GUI Tests"
|
16
|
+
Rake::TestTask.new("test:gui") do |t|
|
17
|
+
ENV["GUI_TESTS"] = "true"
|
18
|
+
t.libs << "test"
|
19
|
+
t.libs << "test"
|
20
|
+
t.pattern = "test/j*_test.rb"
|
21
|
+
t.verbose = true
|
22
|
+
end
|
23
|
+
|
24
|
+
desc "All the tests"
|
25
|
+
Rake::TestTask.new("test:all") do |t|
|
26
|
+
ENV["GUI_TESTS"] = "true"
|
27
|
+
t.libs << "test"
|
28
|
+
t.libs << "test"
|
29
|
+
t.pattern = "test/*_test.rb"
|
30
|
+
t.verbose = true
|
31
|
+
end
|
data/actions/add_result
CHANGED
data/actions/add_taxonomy
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# @package MiGA
|
4
|
-
# @
|
5
|
-
# @license artistic license 2.0
|
6
|
-
# @update Oct-01-2015
|
7
|
-
#
|
4
|
+
# @license Artistic-2.0
|
8
5
|
|
9
6
|
o = {q:true}
|
10
7
|
OptionParser.new do |opt|
|
@@ -57,9 +54,9 @@ if not o[:taxfile].nil?
|
|
57
54
|
$stderr.puts "Reading tax-file and registering taxonomy." unless o[:q]
|
58
55
|
tfh = File.open(o[:taxfile], "r")
|
59
56
|
header = nil
|
60
|
-
|
57
|
+
tfh.each_line do |ln|
|
61
58
|
next if ln =~ /^\s*?$/
|
62
|
-
r = ln.chomp.split
|
59
|
+
r = ln.chomp.split(/\t/, -1)
|
63
60
|
dn = r.shift
|
64
61
|
if header.nil?
|
65
62
|
header = r
|
data/actions/create_dataset
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
2
|
+
|
3
3
|
# @package MiGA
|
4
|
-
# @
|
5
|
-
# @license artistic license 2.0
|
6
|
-
# @update Nov-29-2015
|
7
|
-
#
|
4
|
+
# @license Artistic-2.0
|
8
5
|
|
9
6
|
o = {q:true, ref:true}
|
10
7
|
OptionParser.new do |opt|
|
@@ -55,8 +52,10 @@ raise "Impossible to load project: #{o[:project]}" if p.nil?
|
|
55
52
|
$stderr.puts "Creating dataset." unless o[:q]
|
56
53
|
md = {}
|
57
54
|
[:type, :description, :user, :comments].each{ |k| md[k]=o[k] unless o[k].nil? }
|
58
|
-
|
55
|
+
MiGA::Dataset.new(p, o[:dataset], o[:ref], md)
|
59
56
|
p.add_dataset(o[:dataset])
|
57
|
+
res = d.first_preprocessing
|
58
|
+
put "- #{res}" unless o[:q]
|
60
59
|
|
61
60
|
$stderr.puts "Done." unless o[:q]
|
62
61
|
|