miga-base 0.2.0.6 → 0.2.0.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/LICENSE +201 -0
  4. data/README.md +17 -335
  5. data/Rakefile +31 -0
  6. data/actions/add_result +2 -5
  7. data/actions/add_taxonomy +4 -7
  8. data/actions/create_dataset +5 -6
  9. data/actions/create_project +2 -5
  10. data/actions/daemon +2 -5
  11. data/actions/download_dataset +88 -58
  12. data/actions/find_datasets +36 -38
  13. data/actions/import_datasets +2 -5
  14. data/actions/index_taxonomy +2 -5
  15. data/actions/list_datasets +47 -49
  16. data/actions/list_files +7 -11
  17. data/actions/unlink_dataset +2 -5
  18. data/bin/miga +1 -1
  19. data/lib/miga/common.rb +132 -0
  20. data/lib/miga/daemon.rb +229 -168
  21. data/lib/miga/dataset.rb +354 -277
  22. data/lib/miga/gui.rb +346 -269
  23. data/lib/miga/metadata.rb +115 -71
  24. data/lib/miga/project.rb +361 -259
  25. data/lib/miga/remote_dataset.rb +200 -148
  26. data/lib/miga/result.rb +150 -99
  27. data/lib/miga/tax_index.rb +124 -67
  28. data/lib/miga/taxonomy.rb +129 -100
  29. data/lib/miga/version.rb +57 -0
  30. data/lib/miga.rb +2 -77
  31. data/scripts/_distances_noref_nomulti.bash +2 -0
  32. data/scripts/_distances_ref_nomulti.bash +2 -0
  33. data/scripts/aai_distances.bash +1 -0
  34. data/scripts/ani_distances.bash +1 -0
  35. data/scripts/assembly.bash +1 -0
  36. data/scripts/cds.bash +1 -0
  37. data/scripts/clade_finding.bash +17 -1
  38. data/scripts/distances.bash +1 -0
  39. data/scripts/essential_genes.bash +1 -0
  40. data/scripts/haai_distances.bash +1 -0
  41. data/scripts/init.bash +2 -0
  42. data/scripts/mytaxa.bash +1 -0
  43. data/scripts/mytaxa_scan.bash +1 -0
  44. data/scripts/ogs.bash +1 -0
  45. data/scripts/read_quality.bash +1 -0
  46. data/scripts/ssu.bash +1 -0
  47. data/scripts/subclades.bash +1 -0
  48. data/scripts/trimmed_fasta.bash +1 -0
  49. data/scripts/trimmed_reads.bash +1 -0
  50. data/test/common_test.rb +82 -0
  51. data/test/daemon_test.rb +53 -0
  52. data/test/dataset_test.rb +156 -0
  53. data/test/jruby_gui_test.rb +20 -0
  54. data/test/metadata_test.rb +48 -0
  55. data/test/project_test.rb +54 -0
  56. data/test/remote_dataset_test.rb +41 -0
  57. data/test/tax_index_test.rb +44 -0
  58. data/test/taxonomy_test.rb +36 -0
  59. data/test/test_helper.rb +32 -0
  60. metadata +53 -38
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f80152072105bd365145133c00ddfcd432a008c0
4
- data.tar.gz: 7444a990c359e6c9f2a6a595e688e6319df50ebb
3
+ metadata.gz: 01b3728971a5d407f85578447d4a66dc4c8ab8a8
4
+ data.tar.gz: 656535155e0316681f2d7aa9bcc8b501caed9d96
5
5
  SHA512:
6
- metadata.gz: e4bb05e73def629ea39d72fac9d6e702b247051fb3b21b8db84195127e6d135d9b94bbf5037cde9c8f21e611cf580d53078c215d9c9187bdf861908ad42efe0c
7
- data.tar.gz: ee27ea7cf9b98a3de760e18249e89f6410181d963017e86f5878710bf80a6d3ed5c1715d42ff7b394371add8709816d7c54e9fda5556ae6e4e96a3c4b384ca82
6
+ metadata.gz: 5daf2a27f6a6119e18e5eda94dbda72be91ad7b46f4f8bea401f111ce95d907c88930b08d85543c1694a82aeac54c42dff5d0c3586c0846123d1e1881ad23885
7
+ data.tar.gz: 0a75b0152ad7374729c1cb09b02b539b70429902850c81cdb086f64e9b616cc66f5ba032469be85505c103dc05d15b9469b2c26689b8b4dee2b0c3ef6636ad61
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+ gemspec name: "miga-base"
3
+ gem "codeclimate-test-reporter", group: :test, require: nil
data/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ The Artistic License 2.0
2
+
3
+ Copyright (c) 2016 Luis M Rodriguez-R
4
+
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ This license establishes the terms under which a given free software
11
+ Package may be copied, modified, distributed, and/or redistributed.
12
+ The intent is that the Copyright Holder maintains some artistic
13
+ control over the development of that Package while still keeping the
14
+ Package available as open source and free software.
15
+
16
+ You are always permitted to make arrangements wholly outside of this
17
+ license directly with the Copyright Holder of a given Package. If the
18
+ terms of this license do not permit the full use that you propose to
19
+ make of the Package, you should contact the Copyright Holder and seek
20
+ a different licensing arrangement.
21
+
22
+ Definitions
23
+
24
+ "Copyright Holder" means the individual(s) or organization(s)
25
+ named in the copyright notice for the entire Package.
26
+
27
+ "Contributor" means any party that has contributed code or other
28
+ material to the Package, in accordance with the Copyright Holder's
29
+ procedures.
30
+
31
+ "You" and "your" means any person who would like to copy,
32
+ distribute, or modify the Package.
33
+
34
+ "Package" means the collection of files distributed by the
35
+ Copyright Holder, and derivatives of that collection and/or of
36
+ those files. A given Package may consist of either the Standard
37
+ Version, or a Modified Version.
38
+
39
+ "Distribute" means providing a copy of the Package or making it
40
+ accessible to anyone else, or in the case of a company or
41
+ organization, to others outside of your company or organization.
42
+
43
+ "Distributor Fee" means any fee that you charge for Distributing
44
+ this Package or providing support for this Package to another
45
+ party. It does not mean licensing fees.
46
+
47
+ "Standard Version" refers to the Package if it has not been
48
+ modified, or has been modified only in ways explicitly requested
49
+ by the Copyright Holder.
50
+
51
+ "Modified Version" means the Package, if it has been changed, and
52
+ such changes were not explicitly requested by the Copyright
53
+ Holder.
54
+
55
+ "Original License" means this Artistic License as Distributed with
56
+ the Standard Version of the Package, in its current version or as
57
+ it may be modified by The Perl Foundation in the future.
58
+
59
+ "Source" form means the source code, documentation source, and
60
+ configuration files for the Package.
61
+
62
+ "Compiled" form means the compiled bytecode, object code, binary,
63
+ or any other form resulting from mechanical transformation or
64
+ translation of the Source form.
65
+
66
+
67
+ Permission for Use and Modification Without Distribution
68
+
69
+ (1) You are permitted to use the Standard Version and create and use
70
+ Modified Versions for any purpose without restriction, provided that
71
+ you do not Distribute the Modified Version.
72
+
73
+
74
+ Permissions for Redistribution of the Standard Version
75
+
76
+ (2) You may Distribute verbatim copies of the Source form of the
77
+ Standard Version of this Package in any medium without restriction,
78
+ either gratis or for a Distributor Fee, provided that you duplicate
79
+ all of the original copyright notices and associated disclaimers. At
80
+ your discretion, such verbatim copies may or may not include a
81
+ Compiled form of the Package.
82
+
83
+ (3) You may apply any bug fixes, portability changes, and other
84
+ modifications made available from the Copyright Holder. The resulting
85
+ Package will still be considered the Standard Version, and as such
86
+ will be subject to the Original License.
87
+
88
+
89
+ Distribution of Modified Versions of the Package as Source
90
+
91
+ (4) You may Distribute your Modified Version as Source (either gratis
92
+ or for a Distributor Fee, and with or without a Compiled form of the
93
+ Modified Version) provided that you clearly document how it differs
94
+ from the Standard Version, including, but not limited to, documenting
95
+ any non-standard features, executables, or modules, and provided that
96
+ you do at least ONE of the following:
97
+
98
+ (a) make the Modified Version available to the Copyright Holder
99
+ of the Standard Version, under the Original License, so that the
100
+ Copyright Holder may include your modifications in the Standard
101
+ Version.
102
+
103
+ (b) ensure that installation of your Modified Version does not
104
+ prevent the user installing or running the Standard Version. In
105
+ addition, the Modified Version must bear a name that is different
106
+ from the name of the Standard Version.
107
+
108
+ (c) allow anyone who receives a copy of the Modified Version to
109
+ make the Source form of the Modified Version available to others
110
+ under
111
+
112
+ (i) the Original License or
113
+
114
+ (ii) a license that permits the licensee to freely copy,
115
+ modify and redistribute the Modified Version using the same
116
+ licensing terms that apply to the copy that the licensee
117
+ received, and requires that the Source form of the Modified
118
+ Version, and of any works derived from it, be made freely
119
+ available in that license fees are prohibited but Distributor
120
+ Fees are allowed.
121
+
122
+
123
+ Distribution of Compiled Forms of the Standard Version
124
+ or Modified Versions without the Source
125
+
126
+ (5) You may Distribute Compiled forms of the Standard Version without
127
+ the Source, provided that you include complete instructions on how to
128
+ get the Source of the Standard Version. Such instructions must be
129
+ valid at the time of your distribution. If these instructions, at any
130
+ time while you are carrying out such distribution, become invalid, you
131
+ must provide new instructions on demand or cease further distribution.
132
+ If you provide valid instructions or cease distribution within thirty
133
+ days after you become aware that the instructions are invalid, then
134
+ you do not forfeit any of your rights under this license.
135
+
136
+ (6) You may Distribute a Modified Version in Compiled form without
137
+ the Source, provided that you comply with Section 4 with respect to
138
+ the Source of the Modified Version.
139
+
140
+
141
+ Aggregating or Linking the Package
142
+
143
+ (7) You may aggregate the Package (either the Standard Version or
144
+ Modified Version) with other packages and Distribute the resulting
145
+ aggregation provided that you do not charge a licensing fee for the
146
+ Package. Distributor Fees are permitted, and licensing fees for other
147
+ components in the aggregation are permitted. The terms of this license
148
+ apply to the use and Distribution of the Standard or Modified Versions
149
+ as included in the aggregation.
150
+
151
+ (8) You are permitted to link Modified and Standard Versions with
152
+ other works, to embed the Package in a larger work of your own, or to
153
+ build stand-alone binary or bytecode versions of applications that
154
+ include the Package, and Distribute the result without restriction,
155
+ provided the result does not expose a direct interface to the Package.
156
+
157
+
158
+ Items That are Not Considered Part of a Modified Version
159
+
160
+ (9) Works (including, but not limited to, modules and scripts) that
161
+ merely extend or make use of the Package, do not, by themselves, cause
162
+ the Package to be a Modified Version. In addition, such works are not
163
+ considered parts of the Package itself, and are not subject to the
164
+ terms of this license.
165
+
166
+
167
+ General Provisions
168
+
169
+ (10) Any use, modification, and distribution of the Standard or
170
+ Modified Versions is governed by this Artistic License. By using,
171
+ modifying or distributing the Package, you accept this license. Do not
172
+ use, modify, or distribute the Package, if you do not accept this
173
+ license.
174
+
175
+ (11) If your Modified Version has been derived from a Modified
176
+ Version made by someone other than you, you are nevertheless required
177
+ to ensure that your Modified Version complies with the requirements of
178
+ this license.
179
+
180
+ (12) This license does not grant you the right to use any trademark,
181
+ service mark, tradename, or logo of the Copyright Holder.
182
+
183
+ (13) This license includes the non-exclusive, worldwide,
184
+ free-of-charge patent license to make, have made, use, offer to sell,
185
+ sell, import and otherwise transfer the Package with respect to any
186
+ patent claims licensable by the Copyright Holder that are necessarily
187
+ infringed by the Package. If you institute patent litigation
188
+ (including a cross-claim or counterclaim) against any party alleging
189
+ that the Package constitutes direct or contributory patent
190
+ infringement, then this Artistic License to you shall terminate on the
191
+ date that such litigation is filed.
192
+
193
+ (14) Disclaimer of Warranty:
194
+ THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS
195
+ IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
196
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
197
+ NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL
198
+ LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL
199
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
200
+ DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF
201
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md CHANGED
@@ -1,351 +1,33 @@
1
1
  [![Code Climate](https://codeclimate.com/github/bio-miga/miga/badges/gpa.svg)](https://codeclimate.com/github/bio-miga/miga)
2
2
  [![Test Coverage](https://codeclimate.com/github/bio-miga/miga/badges/coverage.svg)](https://codeclimate.com/github/bio-miga/miga/coverage)
3
- [![Build Status](https://travis-ci.org/lmrodriguezr/gfa.svg?branch=master)](https://travis-ci.org/lmrodriguezr/gfa)
3
+ [![Build Status](https://travis-ci.org/bio-miga/miga.svg?branch=master)](https://travis-ci.org/bio-miga/miga)
4
+ [![Gem Version](https://badge.fury.io/rb/miga-base.svg)](https://badge.fury.io/rb/miga-base)
5
+ [![Inch docs](http://inch-ci.org/github/bio-miga/miga.svg)](http://inch-ci.org/github/bio-miga/miga)
6
+ [![Yard docs](http://img.shields.io/badge/yard-docs-blue.svg)](http://www.rubydoc.info/github/bio-miga/miga)
4
7
 
5
- MiGA: Microbial Genomes Atlas
6
- =============================
7
8
 
9
+ # MiGA: Microbial Genomes Atlas
8
10
 
9
- Installation
10
- ------------
11
+ **Important**: The MiGA code is under active development, and we currently
12
+ cannot ensure any stability on the different interfaces. We'll be launching a
13
+ Beta Testing program soon, with dedicated support for a small number of
14
+ laboratories. If you're interested, please [contact us][contact].
11
15
 
12
- Please see [INSTALLATION.md](./INSTALLATION.md) for instructions.
16
+ For additional information on the MiGA system, please refer to the
17
+ [MiGA manual][gitbook]. For additional information on the MiGA API
18
+ (and Ruby gem), please refer to the [miga docs][rubydoc].
13
19
 
14
20
 
15
- Getting started with MiGA
16
- -------------------------
17
-
18
- ### MiGA Interfaces
19
-
20
- You can interact with MiGA through different interfaces. These interfaces have
21
- different purposes, but they also have some degree of overlap, because different
22
- users with different aims sometimes want to do the same thing. Throughout this
23
- manual I'll be telling you how to do things using mostly the CLI, but I'll also
24
- try to mention the GUI and the Web Interface. The CLI is the most comprehensive
25
- and flexible interface, but the other two are friendlier to humans. There is a
26
- fourth interface that I won't be mentioning at all, but I'll try to document:
27
- the Ruby API. MiGA is mostly written in Ruby, with an object-oriented approach,
28
- and all the interfaces are just thin layers atop the Ruby core. That means that
29
- you can write your own interfaces (or pieces) if you know how to talk to these
30
- Ruby objects. Sometimes I even use `irb`, which is an interactive shell for
31
- Ruby, but that's mostly for debugging.
32
-
33
- #### MiGA CLI
34
-
35
- CLI stands for Command Line Interface. This is a set of little scripts that let
36
- you talk with MiGA through the terminal shell. If MiGA is in your PATH (see
37
- [installation details](./INSTALLATION.md#miga-in-your-path)), you can simply run
38
- `miga` in your terminal, and the help messages will take it from there. All the
39
- MiGA CLI calls look like:
40
-
41
- ```bash
42
- miga task [options]
43
- ```
44
-
45
- Where `task` is one of the supported tasks and `[options]` is a set of dash-flag
46
- options supported by each task. `-h` is always there to provide help. If you're
47
- a MiGA administrator, this is probably the most convenient option for you (but
48
- hey, give the GUI a chance).
49
-
50
- #### MiGA GUI
51
-
52
- The Graphical User Interface is the friendlier option for setting up a MiGA
53
- project. It doesn't have as many options as the CLI, but it's pretty easy to
54
- use, so it's a good option if you have a typical project in your hands.
55
-
56
- #### MiGA Web
57
-
58
- The Web interface for MiGA is the way MiGA reports results from a project. It's
59
- not designed to set up new projects, but to explore existing ones, and to submit
60
- non-reference datasets for analyses.
61
-
62
- ### Creating your first project
63
-
64
- You can do this in the GUI, but I like the CLI better, so I'll be telling you
65
- how to tell MiGA what to do from the CLI. First, think where you'll place your
66
- project. Normally this means a location...
67
-
68
- 1. ... with enough space. This is, plan for at least 4 or 5 times the size of
69
- the input files.
70
-
71
- 2. ... accessible by worker nodes. If you're using a single server, this is not
72
- really an issue. However, if you plan on deploying MiGA in a cluster
73
- infrastructure, make sure your project is reachable by worker nodes.
74
-
75
- 3. ... with fast access. It's not a great idea to set up projects in remote
76
- drives with large latency. In some cases there is no way around this, for example
77
- when that's the only available option in your cluster infrastructure, but try
78
- to avoid this as much as possible.
79
-
80
- Now that you know where to create your project, go ahead and run:
81
-
82
- ```bash
83
- miga create_project -P /path/to/project1 -t type-of-project
84
- ```
85
-
86
- Where `/path/to/project1` is the path to where the project should be created.
87
- You don't need to create the folder in advance, MiGA will take care. See the
88
- next section to help you decide what `type-of-project` to use. There are some
89
- other options that are not mandatory, but will make your project richer. Take a
90
- look at `miga create_project -h`.
91
-
92
- #### Project types
93
-
94
- Projects can be set for different purposes, so we've divided them into "types".
95
- There are four of them, depending on the types of datasets to be processed (see
96
- [Dataset types](#dataset-types)):
97
-
98
- 1. **mixed**: A generic project with any supported type of datasets.
99
-
100
- 2. **metagenomes**: A project containing only metagenomic datasets. This
101
- includes either (or both) metagenomes and viromes.
102
-
103
- 3. **genomes**: A project containing only single-organism datasets. This
104
- includes any of the single-organism types: genome, scgenome, and/or popgenome.
105
-
106
- 4. **clade**: Same as "genomes", but all the datasets are expected to be from
107
- the same species. This type of project performs additional analyses that expect
108
- a very dense ANI matrix, so all genomes in it are expected to have AAI > 90%.
109
-
110
- ### Creating datasets
111
-
112
- Once your project is ready, you can start populating it with datasets and data.
113
- While it's possible to create empty datasets using `miga create_dataset`, the
114
- preferred method is to first add data and then use the data to create the
115
- datasets in batch. For example, let's assume you have a collection of paired-end
116
- raw reads from several datasets. The first step is to format the filenames
117
- properly. For each one of your datasets, pick a name that conforms the
118
- [MiGA names](#miga-names) restrictions (we'll call it "ds1") and rename your
119
- reads to `/path/to/project1/data/01.raw_reads/ds1.1.fastq` for the first
120
- sister and `/path/to/project1/data/01.raw_reads/ds1.2.fastq` for the second
121
- sister. Also, add the date into `/path/to/project1/data/01.raw_reads/ds1.done`.
122
- Check what are the [expected result files](#expected-result-files) below if you
123
- want to start at any other point in the pipeline. Once you have renamed (or
124
- copied) the files inside the project folder, run:
125
-
126
- ```bash
127
- miga find_datasets -P /path/to/project1 -a -r -t type-of-dataset
128
- ```
129
-
130
- The `-a` flag tells MiGA that you want to add the datasets (not just find them);
131
- the `-r` flag tells MiGA that your datasets are to be treated as "reference"
132
- datasets (see [Non-reference datasets](#non-reference-datasets) below); and the
133
- `-t` option tells MiGA what type of datasets you're adding (see
134
- [Dataset types](#dataset-types) below). If you have a mixture of dataset types,
135
- process one at a time. This is, perform this step for each dataset type. Don't
136
- worry about the datasets that are already registered, those will be ignored by
137
- the `find_datasets` task and will remain unchanged.
138
-
139
- #### Expected result files
140
-
141
- For brevity, we'll assume that you're inside `/path/to/project1/data`; *i.e.*,
142
- in the `data` directory of your project. We'll also assume that you're naming
143
- your dataset **ds1**, but you can change this by anything following the
144
- [MiGA names](#miga-names) restrictions. Now, these are the "input" points that
145
- you can use in MiGA:
146
-
147
- 1. **Paired-end raw reads**: The expected files are `01.raw_reads/ds1.1.fastq`
148
- and `01.raw_reads/ds1.2.fastq`, each including a sister end. The reads must be
149
- in the same order in both files (MiGA won't check). You can also use gzipped
150
- files instead.
151
-
152
- 2. **Single-end raw reads**: The expected file is `01.raw_reads/ds1.1.fastq`.
153
- You can also use a gzipped file instead.
154
-
155
- 3. **Paired-end trimmed reads**: These are assumed to be quality-controlled
156
- reads in FastA format, with both ends passing the quality filters. The minimum
157
- expected file is `04.trimmed_fasta/ds1.CoupledReads.fa`, which contains the
158
- reads interposed. You can also pass (in addition) the reads that passed the
159
- quality check without the sister as a gzipped FastA at
160
- `04.trimmed_fasta/ds1.SingleReads.fa.gz`.
161
-
162
- 4. **Single-end trimmed reads**: Similar to the option above, only
163
- quality-checked reads are expected here. The expected file is
164
- `04.trimmed_fasta/ds1.SingleReads.fa`.
165
-
166
- 5. **Assembled fragments**: This can be any assembly result, including complete
167
- genomes. The expected file is `05.assembly/ds1.LargeContigs.fna`, containing
168
- only contigs longer than 500bp. You can also provide the complete assembly
169
- (without length-filtering) at `05.assembly/ds1.AllContigs.fna`.
170
-
171
- 6. **Predicted genes/proteins**: This is the total collection of predicted genes
172
- and proteins. The expected files are `06.cds/ds1.fna`, containing genes, and
173
- `06.cds/ds1.faa`, containing proteins. You can also provide the locations of
174
- said genes in the genome in gzipped GFF v2 (`06.cds/ds1.gff2.gz`), gzipped
175
- GFF v3 (`06.cds/ds1.gff3.gz`), or gzipped tabular (`06.cds/ds1.tab.gz`).
176
-
177
- **IMPORTANT**: In all cases, an additional `ds1.done` file MUST be created in
178
- the same folder. This is meant to prevent MiGA from mistakenly adding files as
179
- results before they're done being processed or transferred. This file must
180
- contain the current [date in MiGA format](#date-in-miga-format). Here's a quick
181
- code snippet to add the `.done` file for all the input files in `01.raw_reads`
182
- (you can adapt this accordingly to any of the other options):
183
-
184
- ```bash
185
- cd /path/to/project1/data/01.raw_reads
186
- for i in *.1.fastq ; do
187
- date "+%Y-%m-%d %H:%M:%S %z" > $(basename $i .1.fastq).done
188
- done
189
- ```
190
-
191
- #### Dataset types
192
-
193
- This is how you tell MiGA what kind of data you have in your datasets. Let's see
194
- the definitions:
195
-
196
- 1. **genome**: The genome from an isolate.
197
- 2. **metagenome**: A metagenome (excluding viromes).
198
- 3. **virome**: A viral metagenome.
199
- 4. **scgenome**: A genome from a single cell.
200
- 5. **popgenome**: The genome of a population (including microdiversity).
201
-
202
- #### Non-reference datasets
203
-
204
-
205
- #### Creating a RefSeq project
206
-
207
- If you've reached this point, you are now ready to create a large functional
208
- project. If you want to continue using this documentation on real data but
209
- don't have any of your own handy (or if you want to use RefSeq data), this
210
- is a quick tutorial on how to create a functional MiGA project using ALL of
211
- NCBI's Prokaryotic RefSeq data.
212
-
213
- **Step 1: Create the project**. That's simple, just `cd` to the directory you
214
- want to use, and execute `miga create_project -P MiGA_RefSeq -t genomes`.
215
-
216
- **Step 2: Download the data**. Just `cd MiGA_RefSeq`, and execute this code:
217
-
218
- ```bash
219
- wget -O reference_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on&type=reference'
220
- grep -v '^#' reference_genomes.txt \
221
- | awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} {print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
222
- | while read ln ; do
223
- sp=$(echo $ln | perl -pe 's/.*# //')
224
- if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
225
- echo $ln
226
- $ln
227
- fi
228
- done
229
- ```
230
-
231
- And that's it. The first line will download the most current list of genomes
232
- included in NCBI's Prokaryotic RefSeq, and the rest will repeatedly execute the
233
- `download_dataset` task, that automatically fetches the data (even the genome's
234
- taxonomy!). Note that the code above checks first if a dataset already exists,
235
- so if you want to update an existing MiGA_RefSeq project, simply repeat step 2
236
- and only missing genomes will be fetched.
237
-
238
- Note that running time for the above code may vary depending on the network and
239
- the size of RefSeq, but I was able to create a complete project with 122 genomes
240
- in under 10 minutes.
241
-
242
- **Alternative step 2: downloading all representatives**. If you want a larger
243
- and more comprehensive collection, and not just the reference genomes, you can
244
- download all of the representative genomes in the prokaryotic RefSeq with this
245
- alternative code:
246
-
247
- ```bash
248
- wget -O representative_genomes.txt 'http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on'
249
- grep -v '^#' representative_genomes.txt \
250
- | awk -F'\t' '{gsub(/[^A-Za-z0-9]/,"_",$3)} $4{print "miga download_dataset -P . -D "$3" -I "$4" -U ncbi --db nuccore -t genome -v # "$3""}' \
251
- | while read ln ; do
252
- sp=$(echo $ln | perl -pe 's/.*# //')
253
- if [[ ! -n $(miga list_datasets -P . -D $sp) ]] ; then
254
- echo $ln
255
- $ln
256
- fi
257
- done
258
- ```
259
-
260
- This is a much larger set (1,246), hence it'll take much more time. I finished
261
- downloading the whole thing in about one and a half hours.
262
-
263
-
264
- Launching daemons
265
- -----------------
266
-
267
- ### Configuring daemons
268
-
269
-
270
- ### Understanding the MiGA configuration file
271
-
272
-
273
- ### Arbitrary configuration scripts
274
-
275
-
276
- ### Fixing system calls with aliases
277
-
278
- In some cases, we might not have the same executable names as MiGA expects, or
279
- we might have broken modules in our cluster that can be easily fixed with an
280
- `alias`. In these cases, you can use
281
- [arbitrary configuration scripts](#arbitrary-configuration-scripts) to generate
282
- one or more `alias`. Importantly, MiGA daemons work with non-interactive shells,
283
- which means you likely need to explicitly allow for alias expansions, for
284
- example:
285
-
286
- ```bash
287
- # Allow alias expansions in non-interactive shells
288
- shopt -s expand_aliases
289
-
290
- # Call FastQC with the environmental Perl,
291
- # not the built-in /usr/bin/perl:
292
- alias fastqc="perl $(which fastqc)"
293
-
294
- # Use the standard name for RAxML (pthreads)
295
- # instead of the one my sys-admin decided to use:
296
- alias raxmlHPC-PTHREADS=RAxML_pthreads
297
- ```
298
-
299
- The examples above illustrate how to use `alias` to fix broken packages or to
300
- make Software with non-standard names reachable.
301
-
302
- **Known caveats to this solution:** This solution CANNOT BE USED in the few
303
- cases in which a whole package is expected based on a single executable. For
304
- example, adding the enveomics scripts to your `PATH` is far easier than creating
305
- an `alias` for each script. Also, MiGA expects to find the model, the activation
306
- key, and the scripts of MetaGeneMark in the same folder of the `gmhmmp` binary,
307
- so setting an `alias` may prevent MiGA from finding these ancillary files.
308
-
309
-
310
- Cluster infrastructure
311
- ----------------------
312
-
313
-
314
- ### Loading optional modules
315
-
316
-
317
- See also [Fixing system calls with aliases](#fixing-system-calls-with-aliases).
318
-
319
-
320
- Miscellaneous
321
- -------------
322
-
323
- These below are reference snippets for which I couldn't find a more
324
- suitable home, but are important documentation.
325
-
326
- ### MiGA Names
327
-
328
- MiGA names are non-empty strings composed exclusively of alphanumerics and
329
- underscores. All the dataset names in MiGA must conform this restriction, but
330
- not all the projects do. Other objects must conform the MiGA name restrictions,
331
- such as taxonomic entries.
332
-
333
- ### Date in MiGA format
334
-
335
- The official format in which MiGA represents date/times is the default of Ruby's
336
- `Time.now.to_s`. In the *nix `date` utility this corresponds to the format:
337
- `+%Y-%m-%d %H:%M:%S %z`.
338
-
339
-
340
- Authors
341
- -------
21
+ # Authors
342
22
 
343
23
  Developed and maintained by [Luis M. Rodriguez-R][lrr].
344
24
 
345
25
 
346
- License
347
- -------
26
+ # License
348
27
 
349
28
  See [LICENSE](LICENSE).
350
29
 
351
30
  [lrr]: http://lmrodriguezr.github.io/
31
+ [gitbook]: https://miga.gitbooks.io/miga/content/
32
+ [rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
33
+ [contact]: http://enve-omics.gatech.edu/node/7
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
+ require "rake/testtask"
2
+
3
+ SOURCES = FileList["lib/**/*.rb"]
4
+
5
+ desc "Default Task"
6
+ task :default => "test:base"
7
+
8
+ desc "Base Tests"
9
+ Rake::TestTask.new("test:base") do |t|
10
+ t.libs << "test"
11
+ t.pattern = "test/[^j]*_test.rb"
12
+ t.verbose = true
13
+ end
14
+
15
+ desc "GUI Tests"
16
+ Rake::TestTask.new("test:gui") do |t|
17
+ ENV["GUI_TESTS"] = "true"
18
+ t.libs << "test"
19
+ t.libs << "test"
20
+ t.pattern = "test/j*_test.rb"
21
+ t.verbose = true
22
+ end
23
+
24
+ desc "All the tests"
25
+ Rake::TestTask.new("test:all") do |t|
26
+ ENV["GUI_TESTS"] = "true"
27
+ t.libs << "test"
28
+ t.libs << "test"
29
+ t.pattern = "test/*_test.rb"
30
+ t.verbose = true
31
+ end
data/actions/add_result CHANGED
@@ -1,10 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- #
2
+
3
3
  # @package MiGA
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license artistic license 2.0
6
- # @update Oct-01-2015
7
- #
4
+ # @license Artistic-2.0
8
5
 
9
6
  o = {q:true}
10
7
  opts = OptionParser.new do |opt|
data/actions/add_taxonomy CHANGED
@@ -1,10 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- #
2
+
3
3
  # @package MiGA
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license artistic license 2.0
6
- # @update Oct-01-2015
7
- #
4
+ # @license Artistic-2.0
8
5
 
9
6
  o = {q:true}
10
7
  OptionParser.new do |opt|
@@ -57,9 +54,9 @@ if not o[:taxfile].nil?
57
54
  $stderr.puts "Reading tax-file and registering taxonomy." unless o[:q]
58
55
  tfh = File.open(o[:taxfile], "r")
59
56
  header = nil
60
- while ln = tfh.gets
57
+ tfh.each_line do |ln|
61
58
  next if ln =~ /^\s*?$/
62
- r = ln.chomp.split /\t/, -1
59
+ r = ln.chomp.split(/\t/, -1)
63
60
  dn = r.shift
64
61
  if header.nil?
65
62
  header = r
data/actions/create_dataset CHANGED
@@ -1,10 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- #
2
+
3
3
  # @package MiGA
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license artistic license 2.0
6
- # @update Nov-29-2015
7
- #
4
+ # @license Artistic-2.0
8
5
 
9
6
  o = {q:true, ref:true}
10
7
  OptionParser.new do |opt|
@@ -55,8 +52,10 @@ raise "Impossible to load project: #{o[:project]}" if p.nil?
55
52
  $stderr.puts "Creating dataset." unless o[:q]
56
53
  md = {}
57
54
  [:type, :description, :user, :comments].each{ |k| md[k]=o[k] unless o[k].nil? }
58
- d = MiGA::Dataset.new(p, o[:dataset], o[:ref], md)
55
+ MiGA::Dataset.new(p, o[:dataset], o[:ref], md)
59
56
  p.add_dataset(o[:dataset])
57
+ res = d.first_preprocessing
58
+ put "- #{res}" unless o[:q]
60
59
 
61
60
  $stderr.puts "Done." unless o[:q]
62
61