miga-base 1.2.18.2 → 1.3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/cli/action/doctor/base.rb +2 -1
  3. data/lib/miga/cli/action/init.rb +1 -1
  4. data/lib/miga/dataset/result/add.rb +3 -2
  5. data/lib/miga/version.rb +2 -2
  6. data/scripts/essential_genes.bash +4 -8
  7. data/utils/FastAAI/LICENSE +8 -0
  8. data/utils/FastAAI/README.md +151 -40
  9. data/utils/FastAAI/__init__.py +1 -0
  10. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962915_1.fna.gz +0 -0
  11. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962925_1.fna.gz +0 -0
  12. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962935_1.fna.gz +0 -0
  13. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962945_1.fna.gz +0 -0
  14. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000962995_1.fna.gz +0 -0
  15. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963025_1.fna.gz +0 -0
  16. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963055_1.fna.gz +0 -0
  17. data/utils/FastAAI/example_genomes/Xanthomonas_albilineans_GCA_000963065_1.fna.gz +0 -0
  18. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_002019225_1.fna.gz +0 -0
  19. data/utils/FastAAI/example_genomes/_Pseudomonas__cissicola_GCA_008801575_1.fna.gz +0 -0
  20. data/utils/FastAAI/fastaai/__init__.py +1 -0
  21. data/utils/FastAAI/fastaai/fastaai +4805 -0
  22. data/utils/FastAAI/fastaai/fastaai.py +4805 -0
  23. data/utils/FastAAI/fastaai/fastaai_miga_crystals_to_db.py +297 -0
  24. data/utils/FastAAI/fastaai/fastaai_miga_preproc.py +931 -0
  25. data/utils/FastAAI/metadata/Accession_names_and_IDs.txt +122 -0
  26. data/utils/distance/commands.rb +51 -23
  27. metadata +23 -6
  28. data/utils/FastAAI/FastAAI +0 -3659
  29. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Archaea_SCG.hmm +0 -0
  30. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Bacteria_SCG.hmm +0 -0
  31. /data/utils/FastAAI/{00.Libraries → fastaai/00.Libraries}/01.SCG_HMMs/Complete_SCG_DB.hmm +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ad3a195707e0391dfc4166b1296d0baf7d42267e4c61013bed5ed1ae3e38fa5
4
- data.tar.gz: b73a2b06c2f21a7cf96f2c0110c6a0ecbddf26ea8ce26974d511f09c0df5b4aa
3
+ metadata.gz: '0832ca5217f409e381a60a13fb077540c6850b3eac0410eb3f077c7e8c82b8ae'
4
+ data.tar.gz: 44e8b3dfdda624c8fd11e7c2b2e142594383b64a912772b1ae6df8df7a0ef33b
5
5
  SHA512:
6
- metadata.gz: 27150ce62c811276af7ccd5d6990f074ea42a1adbae38f2134bb1cf4fbd0be9722a9ce3c335868b97e88e21a3dcd6c7c5eb65c9b958d298265ffe030e1438510
7
- data.tar.gz: 871dabd94f165be6ae1e4d7310bae0565c85e5c1a0fa5ab38a2dd11c1a988580c7cdb0419f561882aa887ff7819e7c67f51928127d0c5c7cf80559baa460c881
6
+ metadata.gz: 5dcc61a0bfe803218b94b262f44ee88f2a591ba5416cbedf755f932811825aa91780967e6852aabbbfd38c436e56deef997635209efa7eb8d67fdcd31f9e1a59
7
+ data.tar.gz: cadef92e9f5a6bc029323660962404c13e6344ef6c662dd2894e364f030b9f8d7d2f2c6ded2617f6de857e485e2e1eff3abafe71f4cb1e4013078dca86c7bf8a
@@ -131,6 +131,7 @@ module MiGA::Cli::Action::Doctor::Base
131
131
  def outdated_fastaai_ess(res)
132
132
  idx1 = res.file_path(:fastaai_index)
133
133
  idx2 = res.file_path(:fastaai_index_2)
134
- idx2.nil? && !idx1.nil?
134
+ idx3 = res.file_path(:fastaai_crystal)
135
+ idx3.nil? && !(idx1.nil? && idx2.nil?)
135
136
  end
136
137
  end
@@ -194,7 +194,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
194
194
  req_libraries = {
195
195
  r: %w[ape cluster vegan],
196
196
  ruby: %w[sqlite3 daemons json],
197
- python: %w[numpy sqlite3]
197
+ python: %w[numpy sqlite3 pyhmmer]
198
198
  }
199
199
 
200
200
  req_libraries.each do |language, libraries|
@@ -125,7 +125,8 @@ module MiGA::Dataset::Result::Add
125
125
  report: '.ess/log',
126
126
  alignments: '.ess/proteins.aln',
127
127
  fastaai_index: '.faix.db.gz',
128
- fastaai_index_2: '.faix'
128
+ fastaai_index_2: '.faix',
129
+ fastaai_crystal: '.crystal'
129
130
  )
130
131
  end
131
132
 
@@ -283,4 +284,4 @@ module MiGA::Dataset::Result::Add
283
284
  r.add_files(files)
284
285
  r
285
286
  end
286
- end
287
+ end
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 18, 2].freeze
15
+ VERSION = [1.3, 0, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2023, 3, 7)
23
+ VERSION_DATE = Date.new(2023, 3, 20)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -38,14 +38,10 @@ HMM.essential.rb \
38
38
  # Index for FastAAI
39
39
  NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
40
40
  | wc -l | awk '{print $1}')
41
- if [[ "$NOMULTI" -eq "1" ]] ; then
42
- echo "$FAA" > "$DATASET"
43
- FastAAI build_db --protein_file "$DATASET" \
44
- -o "${DATASET}.faix.d" --threads "$CORES"
45
- rm "$DATASET"
46
- mv "${DATASET}.faix.d/database/FastAAI_database.sqlite.db" "${DATASET}.faix"
47
- rm -r "${DATASET}.faix.d"
48
- fi
41
+ [[ "$NOMULTI" -eq "1" ]] && \
42
+ python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
43
+ --protein "$FAA" --output_crystal "${DATASET}.crystal" \
44
+ --compress
49
45
 
50
46
  # Reduce files
51
47
  if exists "$DATASET".ess/*.faa ; then
@@ -0,0 +1,8 @@
1
+ The MIT License (MIT)
2
+ Copyright © 2022 Kenji Gerhardt, Carlos Ruiz-Perez, Miguel Rodriguez-Rojas, Konstantinos Konstantinidis
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5
+
6
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7
+
8
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,6 +1,5 @@
1
1
  # FastAAI
2
- Fast estimation of Average Amino Acid Identities (AAI) for bacterial and viral genomes.
3
- Includes a module for the classification of viral genomes.
2
+ Fast estimation of Average Amino Acid Identities (AAI) for bacterial and archaeal genomes.
4
3
 
5
4
  ## Content Table
6
5
  * [Features](#features)
@@ -8,76 +7,188 @@ Includes a module for the classification of viral genomes.
8
7
  * [Requirements](#requirements)
9
8
  * [Installation](#installation)
10
9
  * [Usage](#usage)
11
- * [FAQs](#faqs)
10
+ * [Example](#example)
11
+ * [Additional Information](#info)
12
12
  * [License](#license)
13
13
 
14
14
  ## Features
15
- Coming soon
15
+ *
16
16
 
17
17
  ## Citation
18
18
  Coming soon
19
19
 
20
20
  ## Requirements:
21
- - Programs:
22
- - [HMMER](http://hmmer.org/) >= 3.1
23
- - Python >=3.6,<3.9
24
- - Base Python Modules:
25
- - argparse
26
- - datetime
27
- - pathlib
28
- - shutil
29
- - subprocess
30
- - gzip
31
- - multiprocessing
32
- - textwrap
33
- - pickle
34
- - tempfile
35
- - sys
36
- - functools
21
+ - Python >=3.6 (3.9+ recommended)
37
22
  - Additional Python Modules:
38
23
  - numpy
24
+ - pyrodigal - https://github.com/althonos/pyrodigal/
25
+ - pyhmmer - https://github.com/althonos/pyhmmer
39
26
 
40
27
  ## Installation
41
- ### Conda Installation
42
- FastAAIIt appears we need a bunch of pre-requisites to run FastAAI No worries, their installation using Conda is quite easy. If you don't have Conda, you can install it as follows:
43
- 1. Download Anaconda from https://www.anaconda.com/products/individual.
44
- 2. Run `bash Anaconda-latest-Linux-x86_64.sh` and follow the installation instructions.
45
- 3. Once installed you can run `conda -V`. You should get the version of conda that you installed.
46
28
 
47
- Now, let's add the conda channels required to install the pre-requisites:
29
+ FastAAI and all its dependencies can be installed through pip with the following command:
48
30
 
49
31
  ```bash
50
- conda config --add channels conda-forge
51
- conda config --add channels bioconda
52
- conda config --add channels cruizperez
32
+ pip install FastAAI
53
33
  ```
54
34
 
55
- Then, create an environment for MicrobeAnnotator:
35
+ ## Usage
36
+
37
+ FastAAI executes its behaviors through commands. A list of commands and their behaviors can be seen through simply calling FastAAI on the command line:
56
38
 
57
39
  ```bash
58
- conda create -n fastaai hmmer prodigal numpy python=3.7 fastaai
40
+ fastaai
59
41
  ```
60
42
 
61
- And activate it:
43
+ The various commands each contain their own usage instructions, which can be accessed through calling fastaai [command], e.g.
62
44
 
63
45
  ```bash
64
- conda activate microbeannotator
46
+ fastaai build_db
65
47
  ```
66
48
 
67
- Both main scripts (microbeannotator and microbeannotator_db_builder) should be in your path ready for use!
68
- This should take care of most of the requirements except for Aspera Connect and KofamScan, which are a little more involved. Let's install those.
49
+ The five FastAAI commands are
69
50
 
70
- ### Pip Installation
71
- #Once you have installed the pre-requisites to run MicrobeAnnotator, or if you already had them and you are not using Conda, you can install MicrobeAnnotator using pip:
51
+ * build_db - Input a set of genomes and predict proteins, identify single-copy proteins, and construct (or add to) a FastAAI database.
52
+ * merge_db - Merge two or more FastAAI databases. Can create a new database or modify an existing one.
53
+ * simple_query - Input a set of genomes as a query and a prebuilt FastAAI database as a target; calculate AAI for each query against each target
54
+ * db_query - Query the genomes in one FastAAI database against the genomes in another (or itself). Calculate AAI for each genome pair between the two.
55
+ * single_query - Input exactly two genomes; preprocess as needed and calculate AAI between the pair of genomes.
72
56
 
57
+ ## Example
73
58
 
74
- ## Usage
75
- ### Database creation
59
+ Let's say we have a collection of genomes in a folder labeled "example_genomes" (which you can find in this repository). Each genome in the folder is in its own nucleotide FASTA-format file. The files can be gzipped or uncompressed - FastAAI doesn't care. The ones in the folder here are gzipped.
60
+
61
+ The first step is building a database. Here's an example command to do so:
62
+
63
+ ```bash
64
+ fastaai build_db --genomes example_genomes/ --threads 4 --verbose --output example_build --database my_example_db.db --compress
65
+ ```
66
+
67
+ This will create a folder called "example_build" which contains subfolders named "predicted_proteins," "hmms", "database", and "logs." The logs folder will contain a file named "FastAAI_preprocessing_log.txt," recording information about the protein prediction and HMM search results for each query genome. Finally, the database folder will contain "my_example_db.db," which is your completed FastAAI database.
68
+
69
+ Because we used the --compress flag, files in the predicted_proteins and hmms folders will be gzipped upon output, and because we used the --verbose flag, we'll get a progress report as FastAAI works that will look like so:
70
+
71
+ Completion |############### | 30.00% ( 3 of 10 ) at 19/09/2022 13:56:41
72
+
73
+ The report only updates every 2% completion, so it may be some time between updates if you're running hundreds or thousands of genomes. A build_db command will have two progress bars, one for preprocessing and one for database formatting, but they'll all look like so.
74
+
75
+ Next, we can calculate AAI:
76
+
77
+ ```bash
78
+ fastaai db_query --query example_build/database/my_example_db.db --target example_build/database/my_example_db.db --threads 4 --verbose --output example_build
79
+ ```
80
+
81
+ By supplying the same database as query and target, we'll be calculating an all vs. all AAI estimation for the genomes in the database. This will be our all vs. all estimate for the genomes we had in our "example_genomes" folder.
82
+
83
+ We didn't supply --output_style matrix, so we'll be getting tabular output files. We also didn't tell FastAAI to calculate standard deviations with --do_stdev, so the fourth column will be all N/A. These files will be in example_build/results/, since we gave the same directory base as the output location.
84
+
85
+ When it's done (which should take less than a second), you'll find files that look like this:
86
+
87
+ query target avg_jacc_sim jacc_SD num_shared_SCPs poss_shared_SCPs AAI_estimate
88
+
89
+ _Pseudomonas__cissicola_GCA_002019225_1.fna.gz Xanthomonas_albilineans_GCA_000962915_1.fna.gz 0.5199 N/A 79 79 68.75
90
+
91
+ _Pseudomonas__cissicola_GCA_002019225_1.fna.gz Xanthomonas_albilineans_GCA_000962925_1.fna.gz 0.5176 N/A 79 79 68.63
92
+
93
+ _Pseudomonas__cissicola_GCA_002019225_1.fna.gz Xanthomonas_albilineans_GCA_000962935_1.fna.gz 0.5193 N/A 79 79 68.72
94
+
95
+ _Pseudomonas__cissicola_GCA_002019225_1.fna.gz Xanthomonas_albilineans_GCA_000962945_1.fna.gz 0.5189 N/A 79 79 68.7
96
+
97
+ ...
98
+
99
+ That's it!
100
+
101
+ ## Additional Information
102
+
103
+ #### Input files and their formats
104
+ FastAAI takes genomes, proteins, and tabular HMM search files (see below) as its basic inputs. Genomes are expected to be supplied in nucleotide FASTA format, with each genome (even if they are collections of multiple contigs) to be in a single, separate file. Each protein file is expected to contain the predicted proteome of a single genome in amino acid FASTA format. Each HMM file is expected to be the tabular output resulting from a search of a single genome's proteome against FastAAI's reference set of HMM models.
105
+
106
+ Inputs of each type (genome, protein, HMM) can be given in one of three ways:
107
+ * As a directory: a path to a directory containing only files of a particular type
108
+ * As a file: a text file containing paths to only files of a given type, one path per line
109
+ * As a string: a comma-separated string of paths to files; note that Python will give up if there are too many files given this way.
110
+
111
+ FastAAI will automatically detect the way you supply inputs.
112
+
113
+ #### HMM Files
114
+ In this repository, you can find FastAAI's single copy protein HMM models under the heading of 00.Libraries/01.SCG_HMMs/Complete_SCG_DB.hmm. An HMM file produced by FastAAI will be the result of a search against this collection of models. While FastAAI uses [PyHMMER](https://github.com/althonos/pyhmmer) to implement its HMM search, any FastAAI HMM search can also be replicated with the following command:
115
+
116
+ ```bash
117
+ hmmsearch --tblout [FastAAI_HMM_file] -o [file_to_discard] --cut_tc Complete_SCG_DB.hmm [input_proteome_file]
118
+ ```
119
+
120
+ #### Databases
121
+ FastAAI databases are SQLite 3 databases containing collections of genomes. Within every FastAAI database there will be two metadata tables describing the genomes the database contains and an additional two tables for each SCP observed within the set of genomes the database contains, up to 244 possible tables. The two metadata tables will always be named "genome_index" and "genome_acc_kmer_counts" with the paired, per-SCP tables named as "SCP_Accession_ID" and "SCP_Accession_ID_genomes", with all characters that have syntactic meaning in SQL replaced with underscores. An example of an SCP table pair would be for the SCP PF00119.20, which becomes PF00119_20 after character changes and will produce the tables PF00119_20 and PF00119_20_genomes. Schemas follow:
122
+
123
+ * genome_index (genome TEXT, gen_id INTEGER, protein_count INTEGER)
124
+ * genome_acc_kmer_counts (genome INTEGER, accession INTEGER, count INTEGER)
125
+ * [accession_ID] (kmer INTEGER, genomes BLOB)
126
+ * [accession_ID_genomes] (genome INTEGER, kmers BLOB)
127
+
128
+ Additional notes:
129
+
130
+ * Accession IDs are numbered according to an internal scheme used by FastAAI. Numbering for the IDs is available on this github in the metadata folder.
131
+ * Genomes are always numbered from 0 to (#genomes - 1) within a FastAAI database. No indices may be absent; FastAAI will refuse to add a genome if the genome contains no data FastAAI can use.
132
+ * Both the genomes blob and kmers blob are bytestrings of 32 bit integers.
133
+ * Kmers are represented in the database using 32-bit integers rather than as text. The integer representation of a particular tetramer can be found by finding the ASCII decimal values of each character and concatenating them in order, e.g. "KCMK" has ASCII values K = 75, C = 67, M = 77, K = 75, and thus is represented by the integer 75677775 in a database.
134
+
135
+ ### Building a FastAAI Database
136
+
137
+ The build_db module of FastAAI is designed to take a set of inputs, preprocess them and build a database from those inputs. Input formatting is discussed above. This section discusses the components of a build.
138
+
139
+ #### Preprocessing:
140
+ Preprocessing consists of detecting the input format and (for genome inputs) predicting the proteome for each genome using [Pyrodigal](https://github.com/althonos/pyrodigal), (for genome and protein inputs) searching proteomes against FastAAI's set of SCP HMMs using PyHMMER (https://github.com/althonos/pyhmmer), and (for all input types) extracting the unique tetramer sets of each SCP protein identified as a bidirectional best-match by HMMER. The final result of preprocessing for each genome is a list of SCPs, each with the unique amino acid tetramer set for the corresponding protein.
141
+
142
+ #### Identification of best-matching SCPs
143
+ An HMM search consists of searching protein sequences against a prebuilt model. Proteins matching models below a cutoff are not reported by FastAAI. Where FastAAI is concerned, three remaining features are important: the SCP accession, protein ID, and HMM score. The score of a protein indicates the quality of match to the associated SCP, where higher scores indicate better matches.
144
+
145
+ Among the proteins that pass the initial filter, FastAAI identifies the highest scoring SCP assignment for each protein and the highest scoring assignment for each SCP. If a protein appears multiple times (that is, it matched to more than one SCP), then only the protein's highest scoring match is considered and the others are discarded. Likewise, if an SCP appears multiple times in the results (that is, multiple proteins matched to it), then only the highest scoring protein is retained for that SCP. Bidirectional best-matches are the remaining protein-SCP pairs. This all means that for each best-match between a protein and SCP, the protein's highest scoring match and the SCP's highest scoring match must be their counterpart in the pair.
146
+
147
+ A consequence of this approach is that each SCP can appear only once for each genome and each protein in a proteome can only be the representative for one SCP. A genome can have as little as one SCP or as many as 122 - more typically, a genome will have 50-90.
148
+
149
+ #### Database Construction:
150
+
151
+ After the input genomes are preprocessed, they are ready to be added to a database. The database's genome_index table will be created or updated as needed to provide a numerical index of each genome, and metadata associated with the presence of each SCP and the count of kmers associated with it will be added to the genome_acc_kmer_counts table. Genome names are only represented as text in the genome_index table; in all other places, they are represented with integers according to the genome index.
152
+
153
+ FastAAI will add a record for each SCP in each genome to the corresponding SCP_genomes table, providing a record of the set of kmers associated with each genome in the database that is directly accessible using genome ID as a key. This genome-first representation is used by FastAAI when the database is used as a query.
154
+
155
+ FastAAI will then reorganize the data for each SCP into a kmer-first structure, listing each genome that contained a particular tetramer (e.g. KCMK) on the protein associated with a particular SCP (e.g. PF00119_20). This results in each tetramer being the key to a list of genome indices (e.g. table PF00119_20, tetramer = KCMK, genomes = (0, 2, 5, 13, ...), where genomes 0, 2, 5, 13, etc., all have a representative protein that matched the HMM for SCP PF00119_20, and all of these representative proteins contained the tetramer KCMK). This allows access to the set of genomes where a tetramer intersection would occur using a tetramer as a key. This tetramer-first representation is used by FastAAI when the database is used as a target.
156
+
157
+ Finally, tetramer tables are indexed if an index does not already exist. This is done to speed up the retrieval of tetramers during AAI calculation.
158
+
159
+ #### AAI Calculation
160
+
161
+ All of FastAAI's queries proceed essentially according to the same logic: the set of SCPs shared in common between a pair of genomes are selected, and the Jaccard index of unique tetramers is calculated for each shared SCP in the pairing. The average Jaccard index is calculated from the individual SCP pairings, unweighted, and the average Jaccard index is then transformed into an estimated AAI through an equation (see the FastAAI paper.)
162
+
163
+ To calculate the Jaccard index for a particular SCP, FastAAI selects one genome at a time as a query and sequentially searches each SCP in that genome against a target database. The organization of data discussed above allows FastAAI to select the tetramers associated with each of a query genomes' SCPs using the genome-first representation and then request all of the target genomes associated with each tetramer in the query using the tetramer-first representation. A tabulation of the number of appearances of each target genome produces the size of the tetramer intersection between the query and every target genome.
164
+
165
+ The total number of tetramers associated with each SCP of each target genome are stored in the genome_acc_kmer_counts table, so the calculation of union size for each query and target pair is simply the sum of the number of the current query's tetramers and the number of tetramers in each target, minus the size of the intersection for the query and each target. Calculation of Jaccard index is trivial from here.
166
+
167
+ * size(union(Q, T)) = size(Q) + size(T) - size(intersection(Q, T))
168
+ * Jaccard(Q, T) = size(intersection(Q, T))/size(union(Q, T))
169
+
170
+ #### Outputs
171
+
172
+ FastAAI allows for two primary output formats: tabular and matrix. The format of outputs is set using the "--output_style" argument. "--output_style tsv" produces a tab-separated output file for each query genome containing results for that query genome against all target genomes. These files have column headers which report:
173
+
174
+ * Query genome name
175
+ * Target genome name
176
+ * Average Jaccard index
177
+ * Jaccard index std. deviation ("N/A" unless --do_stdev is used)
178
+ * Count of shared SCPs
179
+ * Number of possibly shared SCPs (max number of SCPs in either member of the query-target genome pair)
180
+ * Estimated AAI.
181
+
182
+ All values other than query name and target name are "N/A" if a query-target pairing shares no SCPs.
76
183
 
184
+ The matrix format ("--output_style matrix") produces a tab-separated matrix containing query names in the first column, target names in the first row, and the final pairwise AAI estimate for each query-target pairing in the appropriate cells of the matrix. The matrix is complete, so there is a duplicate of all AAI estimates off of the main diagonal of the matrix. Further, there are some differences in the reporting of AAI when compared to the tabular format:
77
185
 
78
- ## FAQs
186
+ * The tabular format lists the AAI estimate for query-target pairs with no shared SCPs as "N/A." The matrix format reports these estimates as 0.0.
187
+ * The tabular format constrains AAI estimates between 30 <= AAI <= 90. AAI estimates <30% AAI are reported as "<30% AAI," rather than with a number, as are AAI estimates >90% AAI with the ">90% AAI" label. The matrix format reports these categorical estimates with 15.0 and 95.0 AAI, respectively.
79
188
 
189
+ These changes to avoid text labelling were done to make working with the (often quite large) tabular results in subsequent analyses using R, Python, or another language easier, as all of the results are already in numerical format.
80
190
 
191
+ None of the other data aside from the estimated AAI is available in the matrix-formatted output.
81
192
 
82
193
  ## License
83
194
 
@@ -0,0 +1 @@
1
+ from .fastaai import *
@@ -0,0 +1 @@
1
+ from .fastaai import *