PyamilySeq 0.8.1__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyamilyseq-0.8.1/src/PyamilySeq.egg-info → pyamilyseq-0.9.0}/PKG-INFO +100 -42
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/README.md +98 -41
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/pyproject.toml +1 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/setup.cfg +3 -1
- pyamilyseq-0.9.0/src/PyamilySeq/Cluster_Summary.py +163 -0
- pyamilyseq-0.9.0/src/PyamilySeq/Constants.py +2 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/Group_Splitter.py +145 -113
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq.py +16 -15
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/Seq_Combiner.py +8 -4
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/utils.py +38 -62
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0/src/PyamilySeq.egg-info}/PKG-INFO +100 -42
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/SOURCES.txt +2 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/entry_points.txt +1 -0
- pyamilyseq-0.9.0/src/PyamilySeq.egg-info/requires.txt +1 -0
- pyamilyseq-0.8.1/src/PyamilySeq/Constants.py +0 -2
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/LICENSE +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq_Genus.py +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq_Species.py +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/__init__.py +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq/clusterings.py +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
- {pyamilyseq-0.8.1 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -12,6 +12,7 @@ Classifier: Operating System :: OS Independent
|
|
|
12
12
|
Requires-Python: >=3.6
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
+
Requires-Dist: levenshtein
|
|
15
16
|
|
|
16
17
|
# PyamilySeq - !BETA!
|
|
17
18
|
**PyamilySeq** is a Python tool for clustering gene sequences into groups based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
|
|
@@ -34,19 +35,18 @@ PyamilySeq probably requires Python 3.6 or higher. Install using pip:
|
|
|
34
35
|
```bash
|
|
35
36
|
pip install PyamilySeq
|
|
36
37
|
```
|
|
37
|
-
|
|
38
|
+
PyamilySeq is regularly updated with bugfixes and new features; to update to the newest version, add '-U' to the end of the pip install command.
|
|
38
39
|
## Example usage: Below are two examples of running PyamilySeq in its two main modes.
|
|
39
40
|
### 'Full Mode': Will conduct clustering of sequences with CD-HIT as part of PyamilySeq run
|
|
40
41
|
```
|
|
41
42
|
PyamilySeq -run_mode Full -group_mode Species -clustering_format CD-HIT -output_dir .../test_data/testing/Full
|
|
42
|
-
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80
|
|
43
|
-
-gpa -a -w 99
|
|
43
|
+
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80 -a -w 99
|
|
44
44
|
```
|
|
45
45
|
### 'Partial Mode': Will take the output of a sequence clustering.
|
|
46
46
|
```
|
|
47
|
-
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/
|
|
48
|
-
-cluster_file .../test_data/
|
|
49
|
-
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -
|
|
47
|
+
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/species/testing/Partial
|
|
48
|
+
-cluster_file .../test_data/species/MMseqs2/combined_Ensmbl_pep_cluster.tsv
|
|
49
|
+
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -a -w 99 -verbose
|
|
50
50
|
|
|
51
51
|
```
|
|
52
52
|
#### Note: '-clustering_format TSV/CSV' requires input to be in two columns as below (Same format as MMseqs2 tsv) - Genome name and sequence name are separated by '|'.
|
|
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
|
|
|
58
58
|
```
|
|
59
59
|
### Example output:
|
|
60
60
|
```
|
|
61
|
-
Running PyamilySeq v0.
|
|
61
|
+
Running PyamilySeq v0.9.0
|
|
62
62
|
Calculating Groups
|
|
63
63
|
Gene Groups:
|
|
64
64
|
First_core_99: 2682
|
|
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
|
|
|
80
80
|
-cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
|
|
81
81
|
```
|
|
82
82
|
```commandline
|
|
83
|
-
Running PyamilySeq v0.
|
|
83
|
+
Running PyamilySeq v0.9.0
|
|
84
84
|
Calculating Groups
|
|
85
85
|
Genus Groups:
|
|
86
86
|
First_genera_1: 28549
|
|
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
|
|
|
137
137
|
## PyamilySeq - Menu:
|
|
138
138
|
### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
139
139
|
```
|
|
140
|
-
Running PyamilySeq v0.
|
|
140
|
+
Running PyamilySeq v0.9.0
|
|
141
141
|
usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
|
|
142
142
|
[-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
|
|
143
143
|
[-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
|
|
144
144
|
[-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
|
|
145
145
|
[-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
|
|
146
146
|
|
|
147
|
-
PyamilySeq v0.
|
|
147
|
+
PyamilySeq v0.9.0: A tool that groups genes into unique clusters.
|
|
148
148
|
|
|
149
149
|
options:
|
|
150
150
|
-h, --help show this help message and exit
|
|
@@ -198,15 +198,16 @@ Output Parameters:
|
|
|
198
198
|
-w WRITE_GROUPS Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3" -
|
|
199
199
|
Must provide FASTA file with -original_fasta if in Partial run mode.
|
|
200
200
|
-a Default - No output: SLOW! (Only works for Species mode) Output aligned and concatenated sequences of identified groups - provide
|
|
201
|
-
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in
|
|
201
|
+
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial run mode.
|
|
202
202
|
-original_fasta ORIGINAL_FASTA
|
|
203
|
-
FASTA file to use in conjunction with "-w" or "-
|
|
204
|
-
-
|
|
205
|
-
|
|
203
|
+
FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.
|
|
204
|
+
-no_gpa Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other
|
|
205
|
+
downstream tools
|
|
206
|
+
|
|
207
|
+
Misc Parameters:
|
|
208
|
+
-verbose Print verbose output.
|
|
209
|
+
-v, --version Print out version number and exit
|
|
206
210
|
|
|
207
|
-
Misc:
|
|
208
|
-
-verbose Default - False: Print out runtime messages
|
|
209
|
-
-v Default - False: Print out version number and exit
|
|
210
211
|
```
|
|
211
212
|
|
|
212
213
|
|
|
@@ -216,13 +217,14 @@ Misc:
|
|
|
216
217
|
## Seq-Combiner: This tool is provided to enable the pre-processing of multiple GFF/FASTA files together ready to be clustered by the user.
|
|
217
218
|
### Example:
|
|
218
219
|
```bash
|
|
219
|
-
Seq-Combiner -input_dir .../test_data/genomes -name_split
|
|
220
|
+
Seq-Combiner -input_dir .../test_data/genomes -name_split .gff3 -output_dir .../test_data/genomes -output_name combine_fasta_seqs.fa -input_type combined
|
|
220
221
|
```
|
|
221
222
|
### Seq-Combiner Menu:
|
|
222
223
|
```
|
|
223
|
-
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
224
|
+
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
225
|
+
OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
224
226
|
|
|
225
|
-
|
|
227
|
+
PyamilySeq v0.9.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
226
228
|
|
|
227
229
|
options:
|
|
228
230
|
-h, --help show this help message and exit
|
|
@@ -230,7 +232,8 @@ options:
|
|
|
230
232
|
Required Arguments:
|
|
231
233
|
-input_dir INPUT_DIR Directory location where the files are located.
|
|
232
234
|
-input_type {separate,combined,fasta}
|
|
233
|
-
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
235
|
+
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
236
|
+
for combining multiple FASTA files together.
|
|
234
237
|
-name_split NAME_SPLIT
|
|
235
238
|
substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').
|
|
236
239
|
-output_dir OUTPUT_DIR
|
|
@@ -240,48 +243,103 @@ Required Arguments:
|
|
|
240
243
|
|
|
241
244
|
Optional Arguments:
|
|
242
245
|
-gene_ident GENE_IDENT
|
|
243
|
-
Default - "CDS": Identifier used for extraction of sequences such as
|
|
246
|
+
Default - "CDS": Identifier used for extraction of sequences such as
|
|
247
|
+
"misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo" - Not compatible with "fasta" input mode.
|
|
244
248
|
-translate Default - False: Translate extracted sequences to their AA counterpart?
|
|
245
249
|
|
|
246
250
|
Misc Arguments:
|
|
247
|
-
-v
|
|
248
|
-
|
|
251
|
+
-v, --version Print out version number and exit
|
|
249
252
|
|
|
250
253
|
```
|
|
251
254
|
|
|
252
|
-
|
|
253
|
-
|
|
255
|
+
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
256
|
+
### Example:
|
|
257
|
+
```bash
|
|
258
|
+
Group-Splitter -genome_num 74 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
|
|
259
|
+
```
|
|
260
|
+
### Group-Splitter Menu:
|
|
254
261
|
```
|
|
255
|
-
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -
|
|
256
|
-
|
|
262
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
263
|
+
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
264
|
+
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
265
|
+
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
266
|
+
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
267
|
+
[-verbose] [-v]
|
|
257
268
|
|
|
258
|
-
|
|
269
|
+
PyamilySeq v0.9.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
270
|
+
identified by PyamilySeq.
|
|
259
271
|
|
|
260
272
|
options:
|
|
261
273
|
-h, --help show this help message and exit
|
|
262
274
|
|
|
263
|
-
Required
|
|
275
|
+
Required Parameters:
|
|
264
276
|
-input_fasta INPUT_FASTA
|
|
265
277
|
Input FASTA file containing gene groups.
|
|
266
278
|
-sequence_type {AA,DNA}
|
|
267
279
|
Default - DNA: Are groups "DNA" or "AA" sequences?
|
|
280
|
+
-genome_num GENOME_NUM
|
|
281
|
+
The total number of genomes must be provided
|
|
268
282
|
-output_dir OUTPUT_DIR
|
|
269
283
|
Output directory.
|
|
270
284
|
|
|
271
|
-
|
|
272
|
-
-
|
|
273
|
-
|
|
274
|
-
|
|
285
|
+
Regrouping Parameters:
|
|
286
|
+
-groups GROUPS Default - auto: Detect groups to be split (see
|
|
287
|
+
-group_threshold). Provide "-groups 1,2,3,4" with
|
|
288
|
+
group IDs to split specific groups.
|
|
289
|
+
-group_threshold GROUP_THRESHOLD
|
|
290
|
+
Minimum percentage of genomes with multi-copy
|
|
291
|
+
(default: 80.0) - Does not work with "-groups"
|
|
292
|
+
|
|
293
|
+
CD-HIT Reclustering Parameters:
|
|
294
|
+
-c PIDENT Sequence identity threshold (default: 0.8) - Probably
|
|
295
|
+
should be higher than what was used in initial
|
|
296
|
+
clustering.
|
|
297
|
+
-s LEN_DIFF Length difference cutoff (default: 0.20) - Often the
|
|
298
|
+
most impactful parameter to split 'multi-copy' gene
|
|
299
|
+
groups.
|
|
300
|
+
-T CLUSTERING_THREADS
|
|
275
301
|
Number of threads for clustering (default: 4)
|
|
276
|
-
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
302
|
+
-M CLUSTERING_MEMORY Memory limit in MB for clustering (default: 2000)
|
|
303
|
+
|
|
304
|
+
Misc Parameters:
|
|
305
|
+
-no_delete_temp_files
|
|
306
|
+
Default: Delete all temporary files after processing.
|
|
280
307
|
-verbose Print verbose output.
|
|
281
|
-
-
|
|
308
|
+
-v, --version Print out version number and exit
|
|
309
|
+
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Cluster-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
|
|
313
|
+
### Example:
|
|
314
|
+
```bash
|
|
315
|
+
Cluster-Summary -genome_num 74 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
|
|
316
|
+
```
|
|
317
|
+
### Cluster-Summary Menu:
|
|
318
|
+
```
|
|
319
|
+
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
320
|
+
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
321
|
+
|
|
322
|
+
PyamilySeq v0.9.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
323
|
+
|
|
324
|
+
options:
|
|
325
|
+
-h, --help show this help message and exit
|
|
326
|
+
|
|
327
|
+
Required Parameters:
|
|
328
|
+
-input_clstr INPUT_CLSTR
|
|
329
|
+
Input CD-HIT .clstr file
|
|
330
|
+
-output OUTPUT Output TSV file to store cluster summaries - Will add '.tsv' if not
|
|
331
|
+
provided by user
|
|
332
|
+
-genome_num GENOME_NUM
|
|
333
|
+
The total number of genomes must be provided
|
|
334
|
+
|
|
335
|
+
Optional Arguments:
|
|
336
|
+
-output_dir OUTPUT_DIR
|
|
337
|
+
Default: Same as input file
|
|
338
|
+
|
|
339
|
+
Misc Parameters:
|
|
340
|
+
-verbose Print verbose output.
|
|
341
|
+
-v, --version Print out version number and exit
|
|
282
342
|
|
|
283
|
-
Misc Arguments:
|
|
284
|
-
-v Print out version number and exit
|
|
285
343
|
```
|
|
286
344
|
|
|
287
345
|
### All example input and output data can be found in the 'test_data' directory.
|
|
@@ -19,19 +19,18 @@ PyamilySeq probably requires Python 3.6 or higher. Install using pip:
|
|
|
19
19
|
```bash
|
|
20
20
|
pip install PyamilySeq
|
|
21
21
|
```
|
|
22
|
-
|
|
22
|
+
PyamilySeq is regularly updated with bugfixes and new features; to update to the newest version, add '-U' to the end of the pip install command.
|
|
23
23
|
## Example usage: Below are two examples of running PyamilySeq in its two main modes.
|
|
24
24
|
### 'Full Mode': Will conduct clustering of sequences with CD-HIT as part of PyamilySeq run
|
|
25
25
|
```
|
|
26
26
|
PyamilySeq -run_mode Full -group_mode Species -clustering_format CD-HIT -output_dir .../test_data/testing/Full
|
|
27
|
-
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80
|
|
28
|
-
-gpa -a -w 99
|
|
27
|
+
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80 -a -w 99
|
|
29
28
|
```
|
|
30
29
|
### 'Partial Mode': Will take the output of a sequence clustering.
|
|
31
30
|
```
|
|
32
|
-
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/
|
|
33
|
-
-cluster_file .../test_data/
|
|
34
|
-
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -
|
|
31
|
+
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/species/testing/Partial
|
|
32
|
+
-cluster_file .../test_data/species/MMseqs2/combined_Ensmbl_pep_cluster.tsv
|
|
33
|
+
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -a -w 99 -verbose
|
|
35
34
|
|
|
36
35
|
```
|
|
37
36
|
#### Note: '-clustering_format TSV/CSV' requires input to be in two columns as below (Same format as MMseqs2 tsv) - Genome name and sequence name are separated by '|'.
|
|
@@ -43,7 +42,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
|
|
|
43
42
|
```
|
|
44
43
|
### Example output:
|
|
45
44
|
```
|
|
46
|
-
Running PyamilySeq v0.
|
|
45
|
+
Running PyamilySeq v0.9.0
|
|
47
46
|
Calculating Groups
|
|
48
47
|
Gene Groups:
|
|
49
48
|
First_core_99: 2682
|
|
@@ -65,7 +64,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
|
|
|
65
64
|
-cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
|
|
66
65
|
```
|
|
67
66
|
```commandline
|
|
68
|
-
Running PyamilySeq v0.
|
|
67
|
+
Running PyamilySeq v0.9.0
|
|
69
68
|
Calculating Groups
|
|
70
69
|
Genus Groups:
|
|
71
70
|
First_genera_1: 28549
|
|
@@ -122,14 +121,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
|
|
|
122
121
|
## PyamilySeq - Menu:
|
|
123
122
|
### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
124
123
|
```
|
|
125
|
-
Running PyamilySeq v0.
|
|
124
|
+
Running PyamilySeq v0.9.0
|
|
126
125
|
usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
|
|
127
126
|
[-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
|
|
128
127
|
[-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
|
|
129
128
|
[-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
|
|
130
129
|
[-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
|
|
131
130
|
|
|
132
|
-
PyamilySeq v0.
|
|
131
|
+
PyamilySeq v0.9.0: A tool that groups genes into unique clusters.
|
|
133
132
|
|
|
134
133
|
options:
|
|
135
134
|
-h, --help show this help message and exit
|
|
@@ -183,15 +182,16 @@ Output Parameters:
|
|
|
183
182
|
-w WRITE_GROUPS Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3" -
|
|
184
183
|
Must provide FASTA file with -original_fasta if in Partial run mode.
|
|
185
184
|
-a Default - No output: SLOW! (Only works for Species mode) Output aligned and concatenated sequences of identified groups - provide
|
|
186
|
-
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in
|
|
185
|
+
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial run mode.
|
|
187
186
|
-original_fasta ORIGINAL_FASTA
|
|
188
|
-
FASTA file to use in conjunction with "-w" or "-
|
|
189
|
-
-
|
|
190
|
-
|
|
187
|
+
FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.
|
|
188
|
+
-no_gpa Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other
|
|
189
|
+
downstream tools
|
|
190
|
+
|
|
191
|
+
Misc Parameters:
|
|
192
|
+
-verbose Print verbose output.
|
|
193
|
+
-v, --version Print out version number and exit
|
|
191
194
|
|
|
192
|
-
Misc:
|
|
193
|
-
-verbose Default - False: Print out runtime messages
|
|
194
|
-
-v Default - False: Print out version number and exit
|
|
195
195
|
```
|
|
196
196
|
|
|
197
197
|
|
|
@@ -201,13 +201,14 @@ Misc:
|
|
|
201
201
|
## Seq-Combiner: This tool is provided to enable the pre-processing of multiple GFF/FASTA files together ready to be clustered by the user.
|
|
202
202
|
### Example:
|
|
203
203
|
```bash
|
|
204
|
-
Seq-Combiner -input_dir .../test_data/genomes -name_split
|
|
204
|
+
Seq-Combiner -input_dir .../test_data/genomes -name_split .gff3 -output_dir .../test_data/genomes -output_name combine_fasta_seqs.fa -input_type combined
|
|
205
205
|
```
|
|
206
206
|
### Seq-Combiner Menu:
|
|
207
207
|
```
|
|
208
|
-
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
208
|
+
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
209
|
+
OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
209
210
|
|
|
210
|
-
|
|
211
|
+
PyamilySeq v0.9.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
211
212
|
|
|
212
213
|
options:
|
|
213
214
|
-h, --help show this help message and exit
|
|
@@ -215,7 +216,8 @@ options:
|
|
|
215
216
|
Required Arguments:
|
|
216
217
|
-input_dir INPUT_DIR Directory location where the files are located.
|
|
217
218
|
-input_type {separate,combined,fasta}
|
|
218
|
-
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
219
|
+
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
220
|
+
for combining multiple FASTA files together.
|
|
219
221
|
-name_split NAME_SPLIT
|
|
220
222
|
substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').
|
|
221
223
|
-output_dir OUTPUT_DIR
|
|
@@ -225,48 +227,103 @@ Required Arguments:
|
|
|
225
227
|
|
|
226
228
|
Optional Arguments:
|
|
227
229
|
-gene_ident GENE_IDENT
|
|
228
|
-
Default - "CDS": Identifier used for extraction of sequences such as
|
|
230
|
+
Default - "CDS": Identifier used for extraction of sequences such as
|
|
231
|
+
"misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo" - Not compatible with "fasta" input mode.
|
|
229
232
|
-translate Default - False: Translate extracted sequences to their AA counterpart?
|
|
230
233
|
|
|
231
234
|
Misc Arguments:
|
|
232
|
-
-v
|
|
233
|
-
|
|
235
|
+
-v, --version Print out version number and exit
|
|
234
236
|
|
|
235
237
|
```
|
|
236
238
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
+
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
240
|
+
### Example:
|
|
241
|
+
```bash
|
|
242
|
+
Group-Splitter -genome_num 74 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
|
|
243
|
+
```
|
|
244
|
+
### Group-Splitter Menu:
|
|
239
245
|
```
|
|
240
|
-
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -
|
|
241
|
-
|
|
246
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
247
|
+
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
248
|
+
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
249
|
+
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
250
|
+
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
251
|
+
[-verbose] [-v]
|
|
242
252
|
|
|
243
|
-
|
|
253
|
+
PyamilySeq v0.9.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
254
|
+
identified by PyamilySeq.
|
|
244
255
|
|
|
245
256
|
options:
|
|
246
257
|
-h, --help show this help message and exit
|
|
247
258
|
|
|
248
|
-
Required
|
|
259
|
+
Required Parameters:
|
|
249
260
|
-input_fasta INPUT_FASTA
|
|
250
261
|
Input FASTA file containing gene groups.
|
|
251
262
|
-sequence_type {AA,DNA}
|
|
252
263
|
Default - DNA: Are groups "DNA" or "AA" sequences?
|
|
264
|
+
-genome_num GENOME_NUM
|
|
265
|
+
The total number of genomes must be provided
|
|
253
266
|
-output_dir OUTPUT_DIR
|
|
254
267
|
Output directory.
|
|
255
268
|
|
|
256
|
-
|
|
257
|
-
-
|
|
258
|
-
|
|
259
|
-
|
|
269
|
+
Regrouping Parameters:
|
|
270
|
+
-groups GROUPS Default - auto: Detect groups to be split (see
|
|
271
|
+
-group_threshold). Provide "-groups 1,2,3,4" with
|
|
272
|
+
group IDs to split specific groups.
|
|
273
|
+
-group_threshold GROUP_THRESHOLD
|
|
274
|
+
Minimum percentage of genomes with multi-copy
|
|
275
|
+
(default: 80.0) - Does not work with "-groups"
|
|
276
|
+
|
|
277
|
+
CD-HIT Reclustering Parameters:
|
|
278
|
+
-c PIDENT Sequence identity threshold (default: 0.8) - Probably
|
|
279
|
+
should be higher than what was used in initial
|
|
280
|
+
clustering.
|
|
281
|
+
-s LEN_DIFF Length difference cutoff (default: 0.20) - Often the
|
|
282
|
+
most impactful parameter to split 'multi-copy' gene
|
|
283
|
+
groups.
|
|
284
|
+
-T CLUSTERING_THREADS
|
|
260
285
|
Number of threads for clustering (default: 4)
|
|
261
|
-
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
286
|
+
-M CLUSTERING_MEMORY Memory limit in MB for clustering (default: 2000)
|
|
287
|
+
|
|
288
|
+
Misc Parameters:
|
|
289
|
+
-no_delete_temp_files
|
|
290
|
+
Default: Delete all temporary files after processing.
|
|
265
291
|
-verbose Print verbose output.
|
|
266
|
-
-
|
|
292
|
+
-v, --version Print out version number and exit
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Cluster-Summary menu: This tool can be used to summarise CD-HIT .clstr files:
|
|
297
|
+
### Example:
|
|
298
|
+
```bash
|
|
299
|
+
Cluster-Summary -genome_num 74 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
|
|
300
|
+
```
|
|
301
|
+
### Cluster-Summary Menu:
|
|
302
|
+
```
|
|
303
|
+
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
304
|
+
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
305
|
+
|
|
306
|
+
PyamilySeq v0.9.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
307
|
+
|
|
308
|
+
options:
|
|
309
|
+
-h, --help show this help message and exit
|
|
310
|
+
|
|
311
|
+
Required Parameters:
|
|
312
|
+
-input_clstr INPUT_CLSTR
|
|
313
|
+
Input CD-HIT .clstr file
|
|
314
|
+
-output OUTPUT Output TSV file to store cluster summaries - Will add '.tsv' if not
|
|
315
|
+
provided by user
|
|
316
|
+
-genome_num GENOME_NUM
|
|
317
|
+
The total number of genomes must be provided
|
|
318
|
+
|
|
319
|
+
Optional Arguments:
|
|
320
|
+
-output_dir OUTPUT_DIR
|
|
321
|
+
Default: Same as input file
|
|
322
|
+
|
|
323
|
+
Misc Parameters:
|
|
324
|
+
-verbose Print verbose output.
|
|
325
|
+
-v, --version Print out version number and exit
|
|
267
326
|
|
|
268
|
-
Misc Arguments:
|
|
269
|
-
-v Print out version number and exit
|
|
270
327
|
```
|
|
271
328
|
|
|
272
329
|
### All example input and output data can be found in the 'test_data' directory.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = PyamilySeq
|
|
3
|
-
version = v0.
|
|
3
|
+
version = v0.9.0
|
|
4
4
|
author = Nicholas Dimonaco
|
|
5
5
|
author_email = nicholas@dimonaco.co.uk
|
|
6
6
|
description = PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
@@ -20,6 +20,7 @@ package_dir =
|
|
|
20
20
|
packages = find:
|
|
21
21
|
python_requires = >=3.6
|
|
22
22
|
install_requires =
|
|
23
|
+
levenshtein
|
|
23
24
|
|
|
24
25
|
[options.packages.find]
|
|
25
26
|
where = src
|
|
@@ -30,6 +31,7 @@ console_scripts =
|
|
|
30
31
|
PyamilySeq = PyamilySeq.PyamilySeq:main
|
|
31
32
|
Seq-Combiner = PyamilySeq.Seq_Combiner:main
|
|
32
33
|
Group-Splitter = PyamilySeq.Group_Splitter:main
|
|
34
|
+
Cluster-Summary = PyamilySeq.Cluster_Summary:main
|
|
33
35
|
|
|
34
36
|
[egg_info]
|
|
35
37
|
tag_build =
|