PyamilySeq 0.8.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyamilyseq-0.8.0/src/PyamilySeq.egg-info → pyamilyseq-0.9.0}/PKG-INFO +105 -44
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/README.md +103 -43
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/pyproject.toml +1 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/setup.cfg +3 -1
- pyamilyseq-0.9.0/src/PyamilySeq/Cluster_Summary.py +163 -0
- pyamilyseq-0.9.0/src/PyamilySeq/Constants.py +2 -0
- pyamilyseq-0.9.0/src/PyamilySeq/Group_Splitter.py +382 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq.py +21 -17
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/Seq_Combiner.py +8 -4
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/utils.py +53 -62
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0/src/PyamilySeq.egg-info}/PKG-INFO +105 -44
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/SOURCES.txt +2 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/entry_points.txt +1 -0
- pyamilyseq-0.9.0/src/PyamilySeq.egg-info/requires.txt +1 -0
- pyamilyseq-0.8.0/src/PyamilySeq/Constants.py +0 -2
- pyamilyseq-0.8.0/src/PyamilySeq/Group_Splitter.py +0 -335
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/LICENSE +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq_Genus.py +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/PyamilySeq_Species.py +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/__init__.py +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq/clusterings.py +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
- {pyamilyseq-0.8.0 → pyamilyseq-0.9.0}/src/PyamilySeq.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -12,6 +12,7 @@ Classifier: Operating System :: OS Independent
|
|
|
12
12
|
Requires-Python: >=3.6
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
+
Requires-Dist: levenshtein
|
|
15
16
|
|
|
16
17
|
# PyamilySeq - !BETA!
|
|
17
18
|
**PyamilySeq** is a Python tool for clustering gene sequences into groups based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
|
|
@@ -34,19 +35,18 @@ PyamilySeq probably requires Python 3.6 or higher. Install using pip:
|
|
|
34
35
|
```bash
|
|
35
36
|
pip install PyamilySeq
|
|
36
37
|
```
|
|
37
|
-
|
|
38
|
+
PyamilySeq is regularly updated with bugfixes and new features; to update to the newest version, add '-U' to the end of the pip install command.
|
|
38
39
|
## Example usage: Below are two examples of running PyamilySeq in its two main modes.
|
|
39
40
|
### 'Full Mode': Will conduct clustering of sequences with CD-HIT as part of PyamilySeq run
|
|
40
41
|
```
|
|
41
42
|
PyamilySeq -run_mode Full -group_mode Species -clustering_format CD-HIT -output_dir .../test_data/testing/Full
|
|
42
|
-
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80
|
|
43
|
-
-gpa -a -w 99
|
|
43
|
+
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80 -a -w 99
|
|
44
44
|
```
|
|
45
45
|
### 'Partial Mode': Will take the output of a sequence clustering.
|
|
46
46
|
```
|
|
47
|
-
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/
|
|
48
|
-
-cluster_file .../test_data/
|
|
49
|
-
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -
|
|
47
|
+
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/species/testing/Partial
|
|
48
|
+
-cluster_file .../test_data/species/MMseqs2/combined_Ensmbl_pep_cluster.tsv
|
|
49
|
+
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -a -w 99 -verbose
|
|
50
50
|
|
|
51
51
|
```
|
|
52
52
|
#### Note: '-clustering_format TSV/CSV' requires input to be in two columns as below (Same format as MMseqs2 tsv) - Genome name and sequence name are separated by '|'.
|
|
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
|
|
|
58
58
|
```
|
|
59
59
|
### Example output:
|
|
60
60
|
```
|
|
61
|
-
Running PyamilySeq v0.
|
|
61
|
+
Running PyamilySeq v0.9.0
|
|
62
62
|
Calculating Groups
|
|
63
63
|
Gene Groups:
|
|
64
64
|
First_core_99: 2682
|
|
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
|
|
|
80
80
|
-cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
|
|
81
81
|
```
|
|
82
82
|
```commandline
|
|
83
|
-
Running PyamilySeq v0.
|
|
83
|
+
Running PyamilySeq v0.9.0
|
|
84
84
|
Calculating Groups
|
|
85
85
|
Genus Groups:
|
|
86
86
|
First_genera_1: 28549
|
|
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
|
|
|
137
137
|
## PyamilySeq - Menu:
|
|
138
138
|
### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
139
139
|
```
|
|
140
|
-
Running PyamilySeq v0.
|
|
140
|
+
Running PyamilySeq v0.9.0
|
|
141
141
|
usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
|
|
142
142
|
[-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
|
|
143
143
|
[-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
|
|
144
144
|
[-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
|
|
145
145
|
[-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
|
|
146
146
|
|
|
147
|
-
PyamilySeq v0.
|
|
147
|
+
PyamilySeq v0.9.0: A tool that groups genes into unique clusters.
|
|
148
148
|
|
|
149
149
|
options:
|
|
150
150
|
-h, --help show this help message and exit
|
|
@@ -176,8 +176,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
|
|
|
176
176
|
Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
|
|
177
177
|
-mem CLUSTERING_MEMORY
|
|
178
178
|
Default 4000: Memory to be allocated for clustering (in MBs).
|
|
179
|
-
-t
|
|
180
|
-
|
|
179
|
+
-t THREADS Default 8: Threads to be allocated for clustering
|
|
180
|
+
and/or alignment.
|
|
181
|
+
|
|
181
182
|
|
|
182
183
|
Partial-Mode Arguments - Required when "-run_mode Partial" is used:
|
|
183
184
|
-cluster_file CLUSTER_FILE
|
|
@@ -197,15 +198,16 @@ Output Parameters:
|
|
|
197
198
|
-w WRITE_GROUPS Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3" -
|
|
198
199
|
Must provide FASTA file with -original_fasta if in Partial run mode.
|
|
199
200
|
-a Default - No output: SLOW! (Only works for Species mode) Output aligned and concatenated sequences of identified groups - provide
|
|
200
|
-
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in
|
|
201
|
+
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial run mode.
|
|
201
202
|
-original_fasta ORIGINAL_FASTA
|
|
202
|
-
FASTA file to use in conjunction with "-w" or "-
|
|
203
|
-
-
|
|
204
|
-
|
|
203
|
+
FASTA file to use in conjunction with "-w" or "-a" when running in Partial Mode.
|
|
204
|
+
-no_gpa Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other
|
|
205
|
+
downstream tools
|
|
206
|
+
|
|
207
|
+
Misc Parameters:
|
|
208
|
+
-verbose Print verbose output.
|
|
209
|
+
-v, --version Print out version number and exit
|
|
205
210
|
|
|
206
|
-
Misc:
|
|
207
|
-
-verbose Default - False: Print out runtime messages
|
|
208
|
-
-v Default - False: Print out version number and exit
|
|
209
211
|
```
|
|
210
212
|
|
|
211
213
|
|
|
@@ -215,13 +217,14 @@ Misc:
|
|
|
215
217
|
## Seq-Combiner: This tool is provided to enable the pre-processing of multiple GFF/FASTA files together ready to be clustered by the user.
|
|
216
218
|
### Example:
|
|
217
219
|
```bash
|
|
218
|
-
Seq-Combiner -input_dir .../test_data/genomes -name_split
|
|
220
|
+
Seq-Combiner -input_dir .../test_data/genomes -name_split .gff3 -output_dir .../test_data/genomes -output_name combine_fasta_seqs.fa -input_type combined
|
|
219
221
|
```
|
|
220
222
|
### Seq-Combiner Menu:
|
|
221
223
|
```
|
|
222
|
-
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
224
|
+
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
225
|
+
OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
223
226
|
|
|
224
|
-
|
|
227
|
+
PyamilySeq v0.9.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
225
228
|
|
|
226
229
|
options:
|
|
227
230
|
-h, --help show this help message and exit
|
|
@@ -229,7 +232,8 @@ options:
|
|
|
229
232
|
Required Arguments:
|
|
230
233
|
-input_dir INPUT_DIR Directory location where the files are located.
|
|
231
234
|
-input_type {separate,combined,fasta}
|
|
232
|
-
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
235
|
+
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
236
|
+
for combining multiple FASTA files together.
|
|
233
237
|
-name_split NAME_SPLIT
|
|
234
238
|
substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').
|
|
235
239
|
-output_dir OUTPUT_DIR
|
|
@@ -239,46 +243,103 @@ Required Arguments:
|
|
|
239
243
|
|
|
240
244
|
Optional Arguments:
|
|
241
245
|
-gene_ident GENE_IDENT
|
|
242
|
-
Default - "CDS": Identifier used for extraction of sequences such as
|
|
246
|
+
Default - "CDS": Identifier used for extraction of sequences such as
|
|
247
|
+
"misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo" - Not compatible with "fasta" input mode.
|
|
243
248
|
-translate Default - False: Translate extracted sequences to their AA counterpart?
|
|
244
249
|
|
|
245
250
|
Misc Arguments:
|
|
246
|
-
-v
|
|
247
|
-
|
|
251
|
+
-v, --version Print out version number and exit
|
|
248
252
|
|
|
249
253
|
```
|
|
250
254
|
|
|
251
|
-
|
|
252
|
-
|
|
255
|
+
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
256
|
+
### Example:
|
|
257
|
+
```bash
|
|
258
|
+
Group-Splitter -genome_num 74 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
|
|
253
259
|
```
|
|
254
|
-
|
|
255
|
-
|
|
260
|
+
### Group-Splitter Menu:
|
|
261
|
+
```
|
|
262
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
263
|
+
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
264
|
+
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
265
|
+
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
266
|
+
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
267
|
+
[-verbose] [-v]
|
|
256
268
|
|
|
257
|
-
|
|
269
|
+
PyamilySeq v0.9.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
270
|
+
identified by PyamilySeq.
|
|
258
271
|
|
|
259
272
|
options:
|
|
260
273
|
-h, --help show this help message and exit
|
|
261
274
|
|
|
262
|
-
Required
|
|
275
|
+
Required Parameters:
|
|
263
276
|
-input_fasta INPUT_FASTA
|
|
264
277
|
Input FASTA file containing gene groups.
|
|
278
|
+
-sequence_type {AA,DNA}
|
|
279
|
+
Default - DNA: Are groups "DNA" or "AA" sequences?
|
|
280
|
+
-genome_num GENOME_NUM
|
|
281
|
+
The total number of genomes must be provided.
|
|
265
282
|
-output_dir OUTPUT_DIR
|
|
266
283
|
Output directory.
|
|
267
284
|
|
|
268
|
-
|
|
269
|
-
-
|
|
270
|
-
|
|
271
|
-
|
|
285
|
+
Regrouping Parameters:
|
|
286
|
+
-groups GROUPS Default - auto: Detect groups to be split (see
|
|
287
|
+
-group_threshold). Provide "-groups 1,2,3,4" with
|
|
288
|
+
group IDs to split specific groups.
|
|
289
|
+
-group_threshold GROUP_THRESHOLD
|
|
290
|
+
Minimum percentage of genomes with multi-copy
|
|
291
|
+
(default: 80.0) - Does not work with "-groups"
|
|
292
|
+
|
|
293
|
+
CD-HIT Reclustering Parameters:
|
|
294
|
+
-c PIDENT Sequence identity threshold (default: 0.8) - Probably
|
|
295
|
+
should be higher than what was used in initial
|
|
296
|
+
clustering.
|
|
297
|
+
-s LEN_DIFF Length difference cutoff (default: 0.20) - Often the
|
|
298
|
+
most impactful parameter to split 'multi-copy' gene
|
|
299
|
+
groups.
|
|
300
|
+
-T CLUSTERING_THREADS
|
|
272
301
|
Number of threads for clustering (default: 4)
|
|
273
|
-
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
302
|
+
-M CLUSTERING_MEMORY Memory limit in MB for clustering (default: 2000)
|
|
303
|
+
|
|
304
|
+
Misc Parameters:
|
|
305
|
+
-no_delete_temp_files
|
|
306
|
+
Default: Delete all temporary files after processing.
|
|
277
307
|
-verbose Print verbose output.
|
|
278
|
-
-
|
|
308
|
+
-v, --version Print out version number and exit
|
|
309
|
+
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Cluster-Summary: This tool can be used to summarise CD-HIT .clstr files.
|
|
313
|
+
### Example:
|
|
314
|
+
```bash
|
|
315
|
+
Cluster-Summary -genome_num 74 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
|
|
316
|
+
```
|
|
317
|
+
### Cluster-Summary Menu:
|
|
318
|
+
```
|
|
319
|
+
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
320
|
+
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
321
|
+
|
|
322
|
+
PyamilySeq v0.9.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
323
|
+
|
|
324
|
+
options:
|
|
325
|
+
-h, --help show this help message and exit
|
|
326
|
+
|
|
327
|
+
Required Parameters:
|
|
328
|
+
-input_clstr INPUT_CLSTR
|
|
329
|
+
Input CD-HIT .clstr file
|
|
330
|
+
-output OUTPUT Output TSV file to store cluster summaries - Will add '.tsv' if not
|
|
331
|
+
provided by user
|
|
332
|
+
-genome_num GENOME_NUM
|
|
333
|
+
The total number of genomes must be provided.
|
|
334
|
+
|
|
335
|
+
Optional Arguments:
|
|
336
|
+
-output_dir OUTPUT_DIR
|
|
337
|
+
Default: Same as input file
|
|
338
|
+
|
|
339
|
+
Misc Parameters:
|
|
340
|
+
-verbose Print verbose output.
|
|
341
|
+
-v, --version Print out version number and exit
|
|
279
342
|
|
|
280
|
-
Misc Arguments:
|
|
281
|
-
-v Print out version number and exit
|
|
282
343
|
```
|
|
283
344
|
|
|
284
345
|
### All example input and output data can be found in the 'test_data' directory.
|
|
@@ -19,19 +19,18 @@ PyamilySeq probably requires Python 3.6 or higher. Install using pip:
|
|
|
19
19
|
```bash
|
|
20
20
|
pip install PyamilySeq
|
|
21
21
|
```
|
|
22
|
-
|
|
22
|
+
PyamilySeq is regularly updated with bugfixes and new features; to update to the newest version, add '-U' to the end of the pip install command.
|
|
23
23
|
## Example usage: Below are two examples of running PyamilySeq in its two main modes.
|
|
24
24
|
### 'Full Mode': Will conduct clustering of sequences with CD-HIT as part of PyamilySeq run
|
|
25
25
|
```
|
|
26
26
|
PyamilySeq -run_mode Full -group_mode Species -clustering_format CD-HIT -output_dir .../test_data/testing/Full
|
|
27
|
-
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80
|
|
28
|
-
-gpa -a -w 99
|
|
27
|
+
-input_type combined -input_dir .../test_data/genomes -name_split _combined.gff3 -pid 0.95 -len_diff 0.80 -a -w 99
|
|
29
28
|
```
|
|
30
29
|
### 'Partial Mode': Will take the output of a sequence clustering.
|
|
31
30
|
```
|
|
32
|
-
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/
|
|
33
|
-
-cluster_file .../test_data/
|
|
34
|
-
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -
|
|
31
|
+
PyamilySeq -run_mode Partial -group_mode Species -clustering_format TSV -output_dir .../test_data/species/testing/Partial
|
|
32
|
+
-cluster_file .../test_data/species/MMseqs2/combined_Ensmbl_pep_cluster.tsv
|
|
33
|
+
-original_fasta .../test_data/species/combined_Ensmbl_cds.fasta -a -w 99 -verbose
|
|
35
34
|
|
|
36
35
|
```
|
|
37
36
|
#### Note: '-clustering_format TSV/CSV' requires input to be in two columns as below (Same format as MMseqs2 tsv) - Genome name and sequence name are separated by '|'.
|
|
@@ -43,7 +42,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
|
|
|
43
42
|
```
|
|
44
43
|
### Example output:
|
|
45
44
|
```
|
|
46
|
-
Running PyamilySeq v0.
|
|
45
|
+
Running PyamilySeq v0.9.0
|
|
47
46
|
Calculating Groups
|
|
48
47
|
Gene Groups:
|
|
49
48
|
First_core_99: 2682
|
|
@@ -65,7 +64,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
|
|
|
65
64
|
-cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
|
|
66
65
|
```
|
|
67
66
|
```commandline
|
|
68
|
-
Running PyamilySeq v0.
|
|
67
|
+
Running PyamilySeq v0.9.0
|
|
69
68
|
Calculating Groups
|
|
70
69
|
Genus Groups:
|
|
71
70
|
First_genera_1: 28549
|
|
@@ -122,14 +121,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
|
|
|
122
121
|
## PyamilySeq - Menu:
|
|
123
122
|
### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
124
123
|
```
|
|
125
|
-
Running PyamilySeq v0.
|
|
124
|
+
Running PyamilySeq v0.9.0
|
|
126
125
|
usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
|
|
127
126
|
[-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
|
|
128
127
|
[-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
|
|
129
128
|
[-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
|
|
130
129
|
[-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
|
|
131
130
|
|
|
132
|
-
PyamilySeq v0.
|
|
131
|
+
PyamilySeq v0.9.0: A tool that groups genes into unique clusters.
|
|
133
132
|
|
|
134
133
|
options:
|
|
135
134
|
-h, --help show this help message and exit
|
|
@@ -161,8 +160,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
|
|
|
161
160
|
Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
|
|
162
161
|
-mem CLUSTERING_MEMORY
|
|
163
162
|
Default 4000: Memory to be allocated for clustering (in MBs).
|
|
164
|
-
-t
|
|
165
|
-
|
|
163
|
+
-t THREADS Default 8: Threads to be allocated for clustering
|
|
164
|
+
and/or alignment.
|
|
165
|
+
|
|
166
166
|
|
|
167
167
|
Partial-Mode Arguments - Required when "-run_mode Partial" is used:
|
|
168
168
|
-cluster_file CLUSTER_FILE
|
|
@@ -182,15 +182,16 @@ Output Parameters:
|
|
|
182
182
|
-w WRITE_GROUPS Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3" -
|
|
183
183
|
Must provide FASTA file with -original_fasta if in Partial run mode.
|
|
184
184
|
-a Default - No output: SLOW! (Only works for Species mode) Output aligned and concatenated sequences of identified groups - provide
|
|
185
|
-
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in
|
|
185
|
+
group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial run mode.
|
|
186
186
|
-original_fasta ORIGINAL_FASTA
|
|
187
|
-
FASTA file to use in conjunction with "-w" or "-
|
|
188
|
-
-
|
|
189
|
-
|
|
187
|
+
FASTA file to use in conjunction with "-w" or "-a" when running in Partial Mode.
|
|
188
|
+
-no_gpa Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other
|
|
189
|
+
downstream tools
|
|
190
|
+
|
|
191
|
+
Misc Parameters:
|
|
192
|
+
-verbose Print verbose output.
|
|
193
|
+
-v, --version Print out version number and exit
|
|
190
194
|
|
|
191
|
-
Misc:
|
|
192
|
-
-verbose Default - False: Print out runtime messages
|
|
193
|
-
-v Default - False: Print out version number and exit
|
|
194
195
|
```
|
|
195
196
|
|
|
196
197
|
|
|
@@ -200,13 +201,14 @@ Misc:
|
|
|
200
201
|
## Seq-Combiner: This tool is provided to enable the pre-processing of multiple GFF/FASTA files together ready to be clustered by the user.
|
|
201
202
|
### Example:
|
|
202
203
|
```bash
|
|
203
|
-
Seq-Combiner -input_dir .../test_data/genomes -name_split
|
|
204
|
+
Seq-Combiner -input_dir .../test_data/genomes -name_split .gff3 -output_dir .../test_data/genomes -output_name combine_fasta_seqs.fa -input_type combined
|
|
204
205
|
```
|
|
205
206
|
### Seq-Combiner Menu:
|
|
206
207
|
```
|
|
207
|
-
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
208
|
+
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name
|
|
209
|
+
OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
208
210
|
|
|
209
|
-
|
|
211
|
+
PyamilySeq v0.9.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
|
|
210
212
|
|
|
211
213
|
options:
|
|
212
214
|
-h, --help show this help message and exit
|
|
@@ -214,7 +216,8 @@ options:
|
|
|
214
216
|
Required Arguments:
|
|
215
217
|
-input_dir INPUT_DIR Directory location where the files are located.
|
|
216
218
|
-input_type {separate,combined,fasta}
|
|
217
|
-
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
219
|
+
Type of input files: "separate" for separate FASTA and GFF files, "combined" for GFF files with embedded FASTA sequences and "fasta"
|
|
220
|
+
for combining multiple FASTA files together.
|
|
218
221
|
-name_split NAME_SPLIT
|
|
219
222
|
substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').
|
|
220
223
|
-output_dir OUTPUT_DIR
|
|
@@ -224,46 +227,103 @@ Required Arguments:
|
|
|
224
227
|
|
|
225
228
|
Optional Arguments:
|
|
226
229
|
-gene_ident GENE_IDENT
|
|
227
|
-
Default - "CDS": Identifier used for extraction of sequences such as
|
|
230
|
+
Default - "CDS": Identifier used for extraction of sequences such as
|
|
231
|
+
"misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo" - Not compatible with "fasta" input mode.
|
|
228
232
|
-translate Default - False: Translate extracted sequences to their AA counterpart?
|
|
229
233
|
|
|
230
234
|
Misc Arguments:
|
|
231
|
-
-v
|
|
232
|
-
|
|
235
|
+
-v, --version Print out version number and exit
|
|
233
236
|
|
|
234
237
|
```
|
|
235
238
|
|
|
236
|
-
|
|
237
|
-
|
|
239
|
+
## Group-Splitter: This tool can split multi-copy gene groups using CD-HIT after initial PyamilySeq analysis.
|
|
240
|
+
### Example:
|
|
241
|
+
```bash
|
|
242
|
+
Group-Splitter -genome_num 74 -input_fasta .../test/species/ -output_dir .../test/species/ -sequence_type AA
|
|
238
243
|
```
|
|
239
|
-
|
|
240
|
-
|
|
244
|
+
### Group-Splitter Menu:
|
|
245
|
+
```
|
|
246
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
|
|
247
|
+
-genome_num GENOME_NUM -output_dir OUTPUT_DIR
|
|
248
|
+
[-groups GROUPS] [-group_threshold GROUP_THRESHOLD]
|
|
249
|
+
[-c PIDENT] [-s LEN_DIFF] [-T CLUSTERING_THREADS]
|
|
250
|
+
[-M CLUSTERING_MEMORY] [-no_delete_temp_files]
|
|
251
|
+
[-verbose] [-v]
|
|
241
252
|
|
|
242
|
-
|
|
253
|
+
PyamilySeq v0.9.0: Group-Splitter - A tool to split multi-copy gene groups
|
|
254
|
+
identified by PyamilySeq.
|
|
243
255
|
|
|
244
256
|
options:
|
|
245
257
|
-h, --help show this help message and exit
|
|
246
258
|
|
|
247
|
-
Required
|
|
259
|
+
Required Parameters:
|
|
248
260
|
-input_fasta INPUT_FASTA
|
|
249
261
|
Input FASTA file containing gene groups.
|
|
262
|
+
-sequence_type {AA,DNA}
|
|
263
|
+
Default - DNA: Are groups "DNA" or "AA" sequences?
|
|
264
|
+
-genome_num GENOME_NUM
|
|
265
|
+
The total number of genomes must be provided.
|
|
250
266
|
-output_dir OUTPUT_DIR
|
|
251
267
|
Output directory.
|
|
252
268
|
|
|
253
|
-
|
|
254
|
-
-
|
|
255
|
-
|
|
256
|
-
|
|
269
|
+
Regrouping Parameters:
|
|
270
|
+
-groups GROUPS Default - auto: Detect groups to be split (see
|
|
271
|
+
-group_threshold). Provide "-groups 1,2,3,4" with
|
|
272
|
+
group IDs to split specific groups.
|
|
273
|
+
-group_threshold GROUP_THRESHOLD
|
|
274
|
+
Minimum percentage of genomes with multi-copy
|
|
275
|
+
(default: 80.0) - Does not work with "-groups"
|
|
276
|
+
|
|
277
|
+
CD-HIT Reclustering Parameters:
|
|
278
|
+
-c PIDENT Sequence identity threshold (default: 0.8) - Probably
|
|
279
|
+
should be higher than what was used in initial
|
|
280
|
+
clustering.
|
|
281
|
+
-s LEN_DIFF Length difference cutoff (default: 0.20) - Often the
|
|
282
|
+
most impactful parameter to split 'multi-copy' gene
|
|
283
|
+
groups.
|
|
284
|
+
-T CLUSTERING_THREADS
|
|
257
285
|
Number of threads for clustering (default: 4)
|
|
258
|
-
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
286
|
+
-M CLUSTERING_MEMORY Memory limit in MB for clustering (default: 2000)
|
|
287
|
+
|
|
288
|
+
Misc Parameters:
|
|
289
|
+
-no_delete_temp_files
|
|
290
|
+
Default: Delete all temporary files after processing.
|
|
262
291
|
-verbose Print verbose output.
|
|
263
|
-
-
|
|
292
|
+
-v, --version Print out version number and exit
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Cluster-Summary: This tool can be used to summarise CD-HIT .clstr files.
|
|
297
|
+
### Example:
|
|
298
|
+
```bash
|
|
299
|
+
Cluster-Summary -genome_num 74 -input_clstr .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80.clstr -output_tsv .../test_data/species/E-coli/E-coli_extracted_pep_cd-hit_80_Summary.tsv
|
|
300
|
+
```
|
|
301
|
+
### Cluster-Summary Menu:
|
|
302
|
+
```
|
|
303
|
+
usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
|
|
304
|
+
[-output_dir OUTPUT_DIR] [-verbose] [-v]
|
|
305
|
+
|
|
306
|
+
PyamilySeq v0.9.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
|
|
307
|
+
|
|
308
|
+
options:
|
|
309
|
+
-h, --help show this help message and exit
|
|
310
|
+
|
|
311
|
+
Required Parameters:
|
|
312
|
+
-input_clstr INPUT_CLSTR
|
|
313
|
+
Input CD-HIT .clstr file
|
|
314
|
+
-output OUTPUT Output TSV file to store cluster summaries - Will add '.tsv' if not
|
|
315
|
+
provided by user
|
|
316
|
+
-genome_num GENOME_NUM
|
|
317
|
+
The total number of genomes must be provided.
|
|
318
|
+
|
|
319
|
+
Optional Arguments:
|
|
320
|
+
-output_dir OUTPUT_DIR
|
|
321
|
+
Default: Same as input file
|
|
322
|
+
|
|
323
|
+
Misc Parameters:
|
|
324
|
+
-verbose Print verbose output.
|
|
325
|
+
-v, --version Print out version number and exit
|
|
264
326
|
|
|
265
|
-
Misc Arguments:
|
|
266
|
-
-v Print out version number and exit
|
|
267
327
|
```
|
|
268
328
|
|
|
269
329
|
### All example input and output data can be found in the 'test_data' directory.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = PyamilySeq
|
|
3
|
-
version = v0.
|
|
3
|
+
version = v0.9.0
|
|
4
4
|
author = Nicholas Dimonaco
|
|
5
5
|
author_email = nicholas@dimonaco.co.uk
|
|
6
6
|
description = PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
@@ -20,6 +20,7 @@ package_dir =
|
|
|
20
20
|
packages = find:
|
|
21
21
|
python_requires = >=3.6
|
|
22
22
|
install_requires =
|
|
23
|
+
levenshtein
|
|
23
24
|
|
|
24
25
|
[options.packages.find]
|
|
25
26
|
where = src
|
|
@@ -30,6 +31,7 @@ console_scripts =
|
|
|
30
31
|
PyamilySeq = PyamilySeq.PyamilySeq:main
|
|
31
32
|
Seq-Combiner = PyamilySeq.Seq_Combiner:main
|
|
32
33
|
Group-Splitter = PyamilySeq.Group_Splitter:main
|
|
34
|
+
Cluster-Summary = PyamilySeq.Cluster_Summary:main
|
|
33
35
|
|
|
34
36
|
[egg_info]
|
|
35
37
|
tag_build =
|