ORForise 1.4.3__tar.gz → 1.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. {orforise-1.4.3 → orforise-1.5.1}/PKG-INFO +7 -31
  2. {orforise-1.4.3 → orforise-1.5.1}/README.md +6 -30
  3. {orforise-1.4.3 → orforise-1.5.1}/setup.cfg +6 -1
  4. orforise-1.5.1/src/ORForise/Aggregate_Compare.py +382 -0
  5. orforise-1.5.1/src/ORForise/Annotation_Compare.py +353 -0
  6. orforise-1.5.1/src/ORForise/Comparator.py +881 -0
  7. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/genome_Metrics.py +51 -33
  8. orforise-1.5.1/src/ORForise/Tools/Augustus/Augustus.py +42 -0
  9. orforise-1.5.1/src/ORForise/Tools/Balrog/Balrog.py +44 -0
  10. orforise-1.4.3/src/ORForise/Tools/StORF_Reporter/StORF_Reporter.py → orforise-1.5.1/src/ORForise/Tools/EasyGene/EasyGene.py +19 -13
  11. orforise-1.5.1/src/ORForise/Tools/FGENESB/FGENESB.py +45 -0
  12. orforise-1.5.1/src/ORForise/Tools/FragGeneScan/FragGeneScan.py +42 -0
  13. orforise-1.5.1/src/ORForise/Tools/GFF/GFF.py +66 -0
  14. orforise-1.5.1/src/ORForise/Tools/GLIMMER_3/GLIMMER_3.py +47 -0
  15. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark/GeneMark.py +46 -40
  16. orforise-1.5.1/src/ORForise/Tools/GeneMark_HA/GeneMark_HA.py +42 -0
  17. orforise-1.4.3/src/ORForise/Tools/Balrog/Balrog.py → orforise-1.5.1/src/ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +19 -12
  18. orforise-1.4.3/src/ORForise/Tools/MetaGeneMark/MetaGeneMark.py → orforise-1.5.1/src/ORForise/Tools/GeneMark_S/GeneMark_S.py +19 -13
  19. orforise-1.4.3/src/ORForise/Tools/Prokka/Prokka.py → orforise-1.5.1/src/ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +17 -14
  20. orforise-1.5.1/src/ORForise/Tools/MetaGene/MetaGene.py +42 -0
  21. orforise-1.5.1/src/ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +43 -0
  22. orforise-1.5.1/src/ORForise/Tools/MetaGeneMark/MetaGeneMark.py +43 -0
  23. orforise-1.5.1/src/ORForise/Tools/Prodigal/Prodigal.py +43 -0
  24. orforise-1.5.1/src/ORForise/Tools/Prokka/Prokka.py +45 -0
  25. orforise-1.5.1/src/ORForise/Tools/StORF_Reporter/StORF_Reporter.py +44 -0
  26. orforise-1.5.1/src/ORForise/Tools/TransDecoder/TransDecoder.py +42 -0
  27. orforise-1.5.1/src/ORForise/utils.py +233 -0
  28. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/PKG-INFO +7 -31
  29. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/entry_points.txt +5 -0
  30. orforise-1.4.3/src/ORForise/Aggregate_Compare.py +0 -197
  31. orforise-1.4.3/src/ORForise/Annotation_Compare.py +0 -184
  32. orforise-1.4.3/src/ORForise/Comparator.py +0 -801
  33. orforise-1.4.3/src/ORForise/Tools/Augustus/Augustus.py +0 -35
  34. orforise-1.4.3/src/ORForise/Tools/EasyGene/EasyGene.py +0 -35
  35. orforise-1.4.3/src/ORForise/Tools/FGENESB/FGENESB.py +0 -38
  36. orforise-1.4.3/src/ORForise/Tools/FragGeneScan/FragGeneScan.py +0 -35
  37. orforise-1.4.3/src/ORForise/Tools/GFF/GFF.py +0 -62
  38. orforise-1.4.3/src/ORForise/Tools/GLIMMER_3/GLIMMER_3.py +0 -40
  39. orforise-1.4.3/src/ORForise/Tools/GeneMark_HA/GeneMark_HA.py +0 -35
  40. orforise-1.4.3/src/ORForise/Tools/GeneMark_HMM/GeneMark_HMM.py +0 -36
  41. orforise-1.4.3/src/ORForise/Tools/GeneMark_S/GeneMark_S.py +0 -35
  42. orforise-1.4.3/src/ORForise/Tools/GeneMark_S_2/GeneMark_S_2.py +0 -39
  43. orforise-1.4.3/src/ORForise/Tools/MetaGene/MetaGene.py +0 -35
  44. orforise-1.4.3/src/ORForise/Tools/MetaGeneAnnotator/MetaGeneAnnotator.py +0 -36
  45. orforise-1.4.3/src/ORForise/Tools/Prodigal/Prodigal.py +0 -39
  46. orforise-1.4.3/src/ORForise/Tools/TransDecoder/TransDecoder.py +0 -35
  47. orforise-1.4.3/src/ORForise/utils.py +0 -31
  48. {orforise-1.4.3 → orforise-1.5.1}/LICENSE +0 -0
  49. {orforise-1.4.3 → orforise-1.5.1}/pyproject.toml +0 -0
  50. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/GFF_Adder.py +0 -0
  51. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/GFF_Intersector.py +0 -0
  52. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/__init__.py +0 -0
  53. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/cds_checker.py +0 -0
  54. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/gene_Lenghts.py +0 -0
  55. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/hypothetical_gene_predictions.py +0 -0
  56. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/missed_Gene_Metrics.py +0 -0
  57. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/parital_Match_Analysis.py +0 -0
  58. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/result_File_Analysis.py +0 -0
  59. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/ORForise_Analysis/start_Codon_Substitution.py +0 -0
  60. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/StORForise.py +0 -0
  61. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/Augustus/__init__.py +0 -0
  62. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/Balrog/__init__.py +0 -0
  63. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/EasyGene/__init__.py +0 -0
  64. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/FGENESB/__init__.py +0 -0
  65. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/FragGeneScan/__init__.py +0 -0
  66. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GFF/__init__.py +0 -0
  67. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GLIMMER_3/__init__.py +0 -0
  68. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark/__init__.py +0 -0
  69. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark_HA/__init__.py +0 -0
  70. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark_HMM/__init__.py +0 -0
  71. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark_S/__init__.py +0 -0
  72. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/GeneMark_S_2/__init__.py +0 -0
  73. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/MetaGene/__init__.py +0 -0
  74. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/MetaGeneAnnotator/__init__.py +0 -0
  75. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/MetaGeneMark/__init__.py +0 -0
  76. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/Prodigal/__init__.py +0 -0
  77. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/Prokka/__init__.py +0 -0
  78. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Reporter/__init__.py +0 -0
  79. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/Completely_Undetected/Completey_Undetected.py +0 -0
  80. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/Completely_Undetected/__init__.py +0 -0
  81. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/StORF_Undetected.py +0 -0
  82. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/__init__.py +0 -0
  83. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/unvitiated_Genes/__init__.py +0 -0
  84. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/StORF_Undetected/unvitiated_Genes/unvitiated_Missed_Genes.py +0 -0
  85. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/TransDecoder/__init__.py +0 -0
  86. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/Tools/__init__.py +0 -0
  87. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise/__init__.py +0 -0
  88. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/SOURCES.txt +0 -0
  89. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/dependency_links.txt +0 -0
  90. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/requires.txt +0 -0
  91. {orforise-1.4.3 → orforise-1.5.1}/src/ORForise.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ORForise
3
- Version: 1.4.3
3
+ Version: 1.5.1
4
4
  Summary: ORForise - Platform for analysing and comparing Prokaryote CoDing Sequence (CDS) Gene Predictions.
5
5
  Home-page: https://github.com/NickJD/ORForise
6
6
  Author: Nicholas Dimonaco
@@ -57,13 +57,7 @@ Example output files from ```Annotation-Compare```, ```GFF-Adder``` and ```GFF-I
57
57
  For Help: ```Annotation-Compare -h ```
58
58
 
59
59
  ```python
60
- Thank you for using ORForise
61
- Please report any issues to: https://github.com/NickJD/ORForise/issues
62
- #####
63
- usage: Annotation_Compare.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -t TOOL -tp TOOL_PREDICTION
64
- [-rt REFERENCE_TOOL] [-o OUTNAME] [-v {True,False}]
65
-
66
- ORForise v1.4.3: Annotatione-Compare Run Parameters.
60
+ ORForise v1.5.1: Annotation-Compare Run Parameters.
67
61
 
68
62
  Required Arguments:
69
63
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -78,8 +72,8 @@ Optional Arguments:
78
72
  name to compare output from two tools
79
73
 
80
74
  Output:
81
- -o OUTNAME Define full output filename (format is CSV) - If not provided, summary will be printed to
82
- std-out
75
+ -o OUTDIR Define directory where detailed output should be placed
76
+ -n OUTNAME Define output filename(s) prefix - If not provided, the filename of the reference annotation file will be used - <outname>_<contig_id>_ORF_Comparison.csv
83
77
 
84
78
  Misc:
85
79
  -v {True,False} Default - False: Print out runtime status
@@ -107,13 +101,7 @@ ORForise can be used as the example below.
107
101
  For Help: ```Aggregate-Compare -h ```
108
102
 
109
103
  ```python
110
- Thank you for using ORForise
111
- Please report any issues to: https://github.com/NickJD/ORForise/issues
112
- #####
113
- usage: Aggregate_Compare.py [-h] -dna GENOME_DNA -t TOOLS -tp TOOL_PREDICTIONS -ref REFERENCE_ANNOTATION
114
- [-rt REFERENCE_TOOL] [-o OUTNAME] [-v {True,False}]
115
-
116
- ORForise v1.4.3: Aggregate-Compare Run Parameters.
104
+ ORForise v1.5.1: Aggregate-Compare Run Parameters.
117
105
 
118
106
  Required Arguments:
119
107
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -261,13 +249,7 @@ The ```-gi``` option can be used to allow for different genomic elements to be a
261
249
  For Help: ```GFF-Adder -h ```
262
250
 
263
251
  ```python
264
- Thank you for using ORForise
265
- Please report any issues to: https://github.com/NickJD/ORForise/issues
266
- #####
267
- usage: GFF_Adder.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -at ADDITIONAL_TOOL -add ADDITIONAL_ANNOTATION -o
268
- OUTPUT_FILE [-rt REFERENCE_TOOL] [-gi GENE_IDENT] [-gene_ident GENE_IDENT] [-olap OVERLAP]
269
-
270
- ORForise v1.4.3: GFF-Adder Run Parameters.
252
+ ORForise v1.5.1: GFF-Adder Run Parameters.
271
253
 
272
254
  Required Arguments:
273
255
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -323,13 +305,7 @@ The ```-gi``` option can be used to allow for different genomic elements to be a
323
305
 
324
306
  For Help: ```GFF-Intersector -h ```
325
307
  ```python
326
- Thank you for using ORForise
327
- Please report any issues to: https://github.com/NickJD/ORForise/issues
328
- #####
329
- usage: GFF_Intersector.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -at ADDITIONAL_TOOL -add
330
- ADDITIONAL_ANNOTATION -o OUTPUT_FILE [-rt REFERENCE_TOOL] [-gi GENE_IDENT] [-cov COVERAGE]
331
-
332
- ORForise v1.4.3: GFF-Intersector Run Parameters.
308
+ ORForise v1.5.1: GFF-Intersector Run Parameters.
333
309
 
334
310
  Required Arguments:
335
311
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -40,13 +40,7 @@ Example output files from ```Annotation-Compare```, ```GFF-Adder``` and ```GFF-I
40
40
  For Help: ```Annotation-Compare -h ```
41
41
 
42
42
  ```python
43
- Thank you for using ORForise
44
- Please report any issues to: https://github.com/NickJD/ORForise/issues
45
- #####
46
- usage: Annotation_Compare.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -t TOOL -tp TOOL_PREDICTION
47
- [-rt REFERENCE_TOOL] [-o OUTNAME] [-v {True,False}]
48
-
49
- ORForise v1.4.3: Annotatione-Compare Run Parameters.
43
+ ORForise v1.5.1: Annotation-Compare Run Parameters.
50
44
 
51
45
  Required Arguments:
52
46
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -61,8 +55,8 @@ Optional Arguments:
61
55
  name to compare output from two tools
62
56
 
63
57
  Output:
64
- -o OUTNAME Define full output filename (format is CSV) - If not provided, summary will be printed to
65
- std-out
58
+ -o OUTDIR Define directory where detailed output should be placed
59
+ -n OUTNAME Define output filename(s) prefix - If not provided, the filename of the reference annotation file will be used - <outname>_<contig_id>_ORF_Comparison.csv
66
60
 
67
61
  Misc:
68
62
  -v {True,False} Default - False: Print out runtime status
@@ -90,13 +84,7 @@ ORForise can be used as the example below.
90
84
  For Help: ```Aggregate-Compare -h ```
91
85
 
92
86
  ```python
93
- Thank you for using ORForise
94
- Please report any issues to: https://github.com/NickJD/ORForise/issues
95
- #####
96
- usage: Aggregate_Compare.py [-h] -dna GENOME_DNA -t TOOLS -tp TOOL_PREDICTIONS -ref REFERENCE_ANNOTATION
97
- [-rt REFERENCE_TOOL] [-o OUTNAME] [-v {True,False}]
98
-
99
- ORForise v1.4.3: Aggregate-Compare Run Parameters.
87
+ ORForise v1.5.1: Aggregate-Compare Run Parameters.
100
88
 
101
89
  Required Arguments:
102
90
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -244,13 +232,7 @@ The ```-gi``` option can be used to allow for different genomic elements to be a
244
232
  For Help: ```GFF-Adder -h ```
245
233
 
246
234
  ```python
247
- Thank you for using ORForise
248
- Please report any issues to: https://github.com/NickJD/ORForise/issues
249
- #####
250
- usage: GFF_Adder.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -at ADDITIONAL_TOOL -add ADDITIONAL_ANNOTATION -o
251
- OUTPUT_FILE [-rt REFERENCE_TOOL] [-gi GENE_IDENT] [-gene_ident GENE_IDENT] [-olap OVERLAP]
252
-
253
- ORForise v1.4.3: GFF-Adder Run Parameters.
235
+ ORForise v1.5.1: GFF-Adder Run Parameters.
254
236
 
255
237
  Required Arguments:
256
238
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -306,13 +288,7 @@ The ```-gi``` option can be used to allow for different genomic elements to be a
306
288
 
307
289
  For Help: ```GFF-Intersector -h ```
308
290
  ```python
309
- Thank you for using ORForise
310
- Please report any issues to: https://github.com/NickJD/ORForise/issues
311
- #####
312
- usage: GFF_Intersector.py [-h] -dna GENOME_DNA -ref REFERENCE_ANNOTATION -at ADDITIONAL_TOOL -add
313
- ADDITIONAL_ANNOTATION -o OUTPUT_FILE [-rt REFERENCE_TOOL] [-gi GENE_IDENT] [-cov COVERAGE]
314
-
315
- ORForise v1.4.3: GFF-Intersector Run Parameters.
291
+ ORForise v1.5.1: GFF-Intersector Run Parameters.
316
292
 
317
293
  Required Arguments:
318
294
  -dna GENOME_DNA Genome DNA file (.fa) which both annotations are based on
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = ORForise
3
- version = 1.4.3
3
+ version = 1.5.1
4
4
  author = Nicholas Dimonaco
5
5
  author_email = nicholas@dimonaco.co.uk
6
6
  description = ORForise - Platform for analysing and comparing Prokaryote CoDing Sequence (CDS) Gene Predictions.
@@ -29,10 +29,15 @@ include = *
29
29
  [options.entry_points]
30
30
  console_scripts =
31
31
  Annotation-Compare = ORForise.Annotation_Compare:main
32
+ annotation-compare = ORForise.Annotation_Compare:main
32
33
  Aggregate-Compare = ORForise.Aggregate_Compare:main
34
+ aggregate-compare = ORForise.Aggregate_Compare:main
33
35
  StORForise = ORForise.StORForise:main
36
+ storforise = ORForise.StORForise:main
34
37
  GFF-Adder = ORForise.GFF_Adder:main
38
+ gff-adder = ORForise.GFF_Adder:main
35
39
  GFF-Intersector = ORForise.GFF_Intersector:main
40
+ gff-intersector = ORForise.GFF_Intersector:main
36
41
 
37
42
  [egg_info]
38
43
  tag_build =
@@ -0,0 +1,382 @@
1
+ from importlib import import_module
2
+ import argparse
3
+ import csv, os, gzip, sys
4
+
5
+
6
+ try:
7
+ from Comparator import tool_comparison
8
+ from utils import *
9
+ except ImportError:
10
+ from .Comparator import tool_comparison
11
+ from .utils import *
12
+
13
+ ############################################
14
+
15
+ def comparator(options):
16
+ try:
17
+ try: # Detect whether fasta/gff files are .gz or text and read accordingly
18
+ fasta_in = gzip.open(options.genome_dna, 'rt')
19
+ dna_regions = fasta_load(fasta_in)
20
+ except:
21
+ fasta_in = open(options.genome_dna, 'r', encoding='unicode_escape')
22
+ dna_regions = fasta_load(fasta_in)
23
+ try:
24
+ gff_in = gzip.open(options.reference_annotation, 'rt')
25
+ dna_regions = gff_load(options, gff_in, dna_regions)
26
+ except:
27
+ gff_in = open(options.reference_annotation, 'r', encoding='unicode_escape')
28
+ dna_regions = gff_load(options, gff_in, dna_regions)
29
+ except AttributeError:
30
+ sys.exit("Attribute Error:\nStORF'ed GFF probably already exists - Must be deleted before running (-overwrite)")
31
+ except FileNotFoundError:
32
+ split_path = options.gff.split(os.sep)
33
+ sys.exit("Directory '" + split_path[-2] + "' missing fna/gff files")
34
+ ###############################################
35
+ total_ref_genes = sum(
36
+ len(v[2]) if isinstance(v[2], (list, tuple, set, dict, str)) else 1 for v in dna_regions.values())
37
+ #############################################
38
+ # Collect predictions from tools
39
+ aggregate_Predictions = collections.OrderedDict()
40
+ aggregate_Tools = options.tools.split(',')
41
+ for i, (tool) in enumerate(aggregate_Tools):
42
+ tool_prediction = options.tool_predictions.split(',')[i]
43
+ print(tool)
44
+ try:
45
+ tool_ = import_module('Tools.' + tool + '.' + tool, package='my_current_pkg')
46
+ except ModuleNotFoundError:
47
+ try:
48
+ tool_ = import_module('ORForise.Tools.' + tool + '.' + tool, package='my_current_pkg')
49
+ except ModuleNotFoundError:
50
+ sys.exit("Tool not available")
51
+ tool_ = getattr(tool_, tool)
52
+ ##
53
+ orfs = tool_(tool_prediction, dna_regions)
54
+ for current_contig in orfs:
55
+ if current_contig not in aggregate_Predictions:
56
+ aggregate_Predictions[current_contig] = {}
57
+ current_orfs = orfs[current_contig]
58
+ for key, value in current_orfs.items():
59
+ if key in aggregate_Predictions[current_contig]:
60
+ aggregate_Predictions[current_contig][key][-1] += '|' + tool
61
+ else:
62
+ aggregate_Predictions[current_contig][key] = value
63
+
64
+ aggregate_ORFs = {k: sortORFs(v) for k, v in aggregate_Predictions.items()}
65
+ results = tool_comparison(aggregate_ORFs, dna_regions, options.verbose)
66
+ ############## Printing to std-out and optional csv file
67
+ # Ensure the output directory exists
68
+ os.makedirs(options.outdir, exist_ok=True)
69
+ # Use outname as a directory, basename for files is output-outname
70
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
71
+
72
+ # Prepare to collect summary stats for all contigs
73
+ contig_summaries = []
74
+ ############################################# To get default output filename from input file details
75
+ if options.outdir:
76
+ # Ensure the output directory exists
77
+ os.makedirs(options.outdir, exist_ok=True)
78
+ # Use outname as a directory, basename for files is output-outname
79
+ base_out = os.path.join(options.outdir, f"{os.path.basename(options.outname)}")
80
+ with open(f"{base_out}_summary.txt", 'w', encoding='utf-8') as out_file:
81
+ out_file.write('Genome Used: ' + str(options.genome_dna.split('/')[-1]) + '\n')
82
+ if options.reference_tool:
83
+ out_file.write('Reference Tool Used: ' + str(options.reference_tool) + '\n')
84
+ else:
85
+ out_file.write('Reference Used: ' + str(options.reference_annotation.split('/')[-1]) + '\n')
86
+ out_file.write('Tool Compared: ' + str(options.tools) + '\n')
87
+ out_file.write('Total Number of Reference Genes: ' + str(total_ref_genes) + '\n')
88
+ out_file.write('Number of Contigs: ' + str(len(dna_regions)) + '\n')
89
+ out_file.write(
90
+ 'Contig\tGenes\tORFs\tPerfect_Matches\tPartial_Matches\tMissed_Genes\tUnmatched_ORFs\tMulti_Matched_ORFs\n')
91
+
92
+ for dna_region, result in results.items():
93
+ num_current_genes = len(dna_regions[dna_region][2])
94
+ num_orfs = result['pred_metrics']['Number_of_ORFs']
95
+ num_perfect = result['pred_metrics']['Number_of_Perfect_Matches']
96
+ num_partial = len(result['pred_metrics']['partial_Hits'])
97
+ num_missed = len(result['rep_metrics']['genes_Undetected'])
98
+ num_unmatched = len(result['pred_metrics']['unmatched_ORFs'])
99
+ num_multi = len(result['pred_metrics']['multi_Matched_ORFs'])
100
+
101
+ ####
102
+ # Tool-specific stats
103
+ tool_stats = {}
104
+ for tool in options.tools.split(','):
105
+ tool_stats[tool] = {
106
+ 'perfect': 0,
107
+ 'partial': 0,
108
+ 'unmatched': 0,
109
+ 'multi': 0
110
+ }
111
+ # Count perfect matches per tool
112
+ for key in result['pred_metrics'].get('perfect_Matches', {}):
113
+ for tool in options.tools.split(','):
114
+ if tool in key:
115
+ tool_stats[tool]['perfect'] += 1
116
+ # Count partial matches per tool
117
+ for key in result['pred_metrics'].get('partial_Hits', {}):
118
+ for tool in options.tools.split(','):
119
+ if tool in key:
120
+ tool_stats[tool]['partial'] += 1
121
+ # Count unmatched ORFs per tool
122
+ for key in result['pred_metrics'].get('unmatched_ORFs', {}):
123
+ for tool in options.tools.split(','):
124
+ if tool in key:
125
+ tool_stats[tool]['unmatched'] += 1
126
+ # Count multi-matched ORFs per tool
127
+ for key in result['pred_metrics'].get('multi_Matched_ORFs', {}):
128
+ for tool in options.tools.split(','):
129
+ if tool in key:
130
+ tool_stats[tool]['multi'] += 1
131
+ ####
132
+
133
+ # Collect summary for this contig
134
+ if options.outdir:
135
+ contig_summaries.append([
136
+ dna_region, num_current_genes, num_orfs, num_perfect, num_partial, num_missed, num_unmatched, num_multi
137
+ ])
138
+ ###
139
+ num_current_genes = len(dna_regions[dna_region][2])
140
+ print("These are the results for: " + dna_region + '\n')
141
+ ############################################# To get default output filename from input file details
142
+ genome_name = options.reference_annotation.split('/')[-1].split('.')[0]
143
+ rep_metric_description, rep_metrics = get_rep_metrics(result)
144
+ all_metric_description, all_metrics = get_all_metrics(result)
145
+
146
+ print('Current Contig: ' + str(dna_region))
147
+ print('Number of Genes: ' + str(num_current_genes))
148
+ print('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']))
149
+ print('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(num_current_genes)+ '] - '+ format(100 * result['pred_metrics']['Number_of_Perfect_Matches']/num_current_genes,'.2f')+'%')
150
+ print('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['partial_Hits'])/num_current_genes,'.2f')+'%')
151
+ print('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['rep_metrics']['genes_Undetected'])/num_current_genes,'.2f')+'%')
152
+ print('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['unmatched_ORFs'])/num_current_genes,'.2f')+'%')
153
+ print('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(num_current_genes)+ '] - '+ format(100 * len(result['pred_metrics']['multi_Matched_ORFs'])/num_current_genes,'.2f')+'%')
154
+ print('Tool breakdown:')
155
+ for tool, stats in tool_stats.items():
156
+ print(
157
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}")
158
+
159
+ if options.outdir:
160
+ # Prepare output directory and file names for each contig
161
+ contig_save = dna_region.replace('/', '_').replace('\\', '_')
162
+ contig_dir = os.path.join(options.outdir, contig_save)
163
+ os.makedirs(contig_dir, exist_ok=True)
164
+ summary_file = os.path.join(contig_dir, "summary.txt")
165
+ csv_file = os.path.join(contig_dir, "metrics.csv")
166
+ perfect_fasta = os.path.join(contig_dir, "perfect_matches.fasta")
167
+ partial_fasta = os.path.join(contig_dir, "partial_matches.fasta")
168
+ missed_fasta = os.path.join(contig_dir, "missed_genes.fasta")
169
+ unmatched_fasta = os.path.join(contig_dir, "unmatched_orfs.fasta")
170
+ multi_fasta = os.path.join(contig_dir, "multi_matched_orfs.fasta")
171
+
172
+ # Write summary to text file
173
+ with open(summary_file, 'w', encoding='utf-8') as sf:
174
+ sf.write('Current Contig: ' + str(dna_region) + '\n')
175
+ sf.write('Number of Genes: ' + str(num_current_genes) + '\n')
176
+ sf.write('Number of ORFs: ' + str(result['pred_metrics']['Number_of_ORFs']) + '\n')
177
+ sf.write('Perfect Matches: ' + str(result['pred_metrics']['Number_of_Perfect_Matches']) + ' [' + str(
178
+ num_current_genes) + '] - ' + format(
179
+ 100 * result['pred_metrics']['Number_of_Perfect_Matches'] / num_current_genes, '.2f') + '%\n')
180
+ sf.write('Partial Matches: ' + str(len(result['pred_metrics']['partial_Hits'])) + ' [' + str(
181
+ num_current_genes) + '] - ' + format(
182
+ 100 * len(result['pred_metrics']['partial_Hits']) / num_current_genes, '.2f') + '%\n')
183
+ sf.write('Missed Genes: ' + str(len(result['rep_metrics']['genes_Undetected'])) + ' [' + str(
184
+ num_current_genes) + '] - ' + format(
185
+ 100 * len(result['rep_metrics']['genes_Undetected']) / num_current_genes, '.2f') + '%\n')
186
+ sf.write('Unmatched ORFs: ' + str(len(result['pred_metrics']['unmatched_ORFs'])) + ' [' + str(
187
+ num_current_genes) + '] - ' + format(
188
+ 100 * len(result['pred_metrics']['unmatched_ORFs']) / num_current_genes, '.2f') + '%\n')
189
+ sf.write('Multi-matched ORFs: ' + str(len(result['pred_metrics']['multi_Matched_ORFs'])) + ' [' + str(
190
+ num_current_genes) + '] - ' + format(
191
+ 100 * len(result['pred_metrics']['multi_Matched_ORFs']) / num_current_genes, '.2f') + '%\n')
192
+ sf.write('Tool breakdown:\n')
193
+ for tool, stats in tool_stats.items():
194
+ sf.write(
195
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}\n")
196
+
197
+ # Write metrics to CSV
198
+ with open(csv_file, 'w', newline='\n', encoding='utf-8') as out_file:
199
+ tool_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, escapechar=" ")
200
+ tool_out.writerow(['Representative_Metrics:'])
201
+ tool_out.writerow(rep_metric_description.split(','))
202
+ tool_out.writerow([*rep_metrics])
203
+ tool_out.writerow(['Prediction_Metrics:'])
204
+ tool_out.writerow(all_metric_description.split(','))
205
+ tool_out.writerow([*all_metrics])
206
+ tool_out.writerow(['Reference_CDS_Gene_Coverage_of_Genome'])
207
+ tool_out.writerow([''.join(map(str, result['rep_metrics']['gene_Coverage_Genome']))])
208
+ tool_out.writerow(['Predicted_CDS_Coverage_of_Genome'])
209
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['orf_Coverage_Genome']))])
210
+ tool_out.writerow(['Matched_Predicted_CDS_Coverage_of_Genome'])
211
+ tool_out.writerow([''.join(map(str, result['pred_metrics']['matched_ORF_Coverage_Genome']))])
212
+ # tool_out.writerow(['Start_Position_Difference:'])
213
+ # tool_out.writerow(result.get('start_Difference', []))
214
+ # tool_out.writerow(['Stop_Position_Difference:'])
215
+ # tool_out.writerow(result.get('stop_Difference', []))
216
+ # tool_out.writerow(['Alternative_Starts_Predicted:'])
217
+ # tool_out.writerow(result.get('other_Starts', []))
218
+ # tool_out.writerow(['Alternative_Stops_Predicted:'])
219
+ # tool_out.writerow(result.get('other_Stops', []))
220
+ # tool_out.writerow(['Undetected_Gene_Metrics:'])
221
+ # tool_out.writerow([
222
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
223
+ # ])
224
+ # tool_out.writerow(result.get('undetected_Gene_Metrics', []))
225
+ # tool_out.writerow(['\nPredicted_CDSs_Without_Corresponding_Gene_In_Reference_Metrics:'])
226
+ # tool_out.writerow([
227
+ # 'ATG_Start,GTG_Start,TTG_Start,ATT_Start,CTG_Start,Alternative_Start_Codon,TGA_Stop,TAA_Stop,TAG_Stop,Alternative_Stop_Codon,Median_Length,ORFs_on_Positive_Strand,ORFs_on_Negative_Strand'
228
+ # ])
229
+ # tool_out.writerow(result.get('unmatched_ORF_Metrics', []))
230
+
231
+ # Write perfect matches to FASTA
232
+ with open(perfect_fasta, 'w', encoding='utf-8') as f:
233
+ for key, value in result['pred_metrics'].get('perfect_Matches', {}).items():
234
+ key_parts = key.split(',')
235
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}_{key_parts[5]}"
236
+ f.write(f"{id}\n{value}\n")
237
+
238
+ # Write partial matches to FASTA
239
+ with open(partial_fasta, 'w', encoding='utf- 8') as f:
240
+ for key, value in result['pred_metrics'].get('partial_Hits', {}).items():
241
+ key_parts = key.split(';')
242
+ gene_Seq = value[0]
243
+ orf_Seq = value[1]
244
+ f.write(f">{key_parts[0]}_gene\n{gene_Seq}\n>{key_parts[1]}_orf\n{orf_Seq}\n")
245
+
246
+ # Write missed genes to FASTA
247
+ with open(missed_fasta, 'w', encoding='utf-8') as f:
248
+ for key, value in result['rep_metrics'].get('genes_Undetected', {}).items():
249
+ key_parts = key.split(',')
250
+ id = f">{genome_name}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
251
+ f.write(f"{id}\n{value}\n")
252
+
253
+ # Write unmatched ORFs to FASTA
254
+ with open(unmatched_fasta, 'w', encoding='utf-8') as f:
255
+ for key, value in result['pred_metrics'].get('unmatched_ORFs', {}).items():
256
+ key_parts = key.split(',')
257
+ id = f">{options.tools}_{key_parts[0]}_{key_parts[1]}_{key_parts[2]}"
258
+ f.write(f"{id}\n{value}\n")
259
+
260
+ # Write multi-matched ORFs to FASTA
261
+ with open(multi_fasta, 'w', encoding='utf-8') as f:
262
+ for key, value in result['pred_metrics'].get('multi_Matched_ORFs', {}).items():
263
+ key_parts = key.split(',')
264
+ multi = f">Predicted_CDS:{key_parts[0]}-{key_parts[1]}_Genes:{'|'.join(value)}"
265
+ f.write(f"{multi}\n")
266
+
267
+ # After all contigs, append the summary table to the main summary file
268
+ if options.outdir and contig_summaries:
269
+ with open(f"{base_out}_summary.txt", 'a', encoding='utf-8') as out_file:
270
+ for row in contig_summaries:
271
+ out_file.write('\t'.join(map(str, row)) + '\n')
272
+ # Optionally, add overall totals
273
+ total_genes = sum(row[1] for row in contig_summaries)
274
+ total_orfs = sum(row[2] for row in contig_summaries)
275
+ total_perfect = sum(row[3] for row in contig_summaries)
276
+ total_partial = sum(row[4] for row in contig_summaries)
277
+ total_missed = sum(row[5] for row in contig_summaries)
278
+ total_unmatched = sum(row[6] for row in contig_summaries)
279
+ total_multi = sum(row[7] for row in contig_summaries)
280
+ out_file.write('\nOverall Summary:\n')
281
+ out_file.write(f'Number of Genes: {total_genes}\n')
282
+ out_file.write(f'Number of ORFs: {total_orfs}\n')
283
+ out_file.write(
284
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%\n')
285
+ out_file.write(
286
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%\n')
287
+ out_file.write(
288
+ f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%\n')
289
+ out_file.write(
290
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%\n')
291
+ out_file.write(
292
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%\n')
293
+
294
+ # Calculate combined tool stats - could be optimised further
295
+ combined_tool_stats = {tool: {'perfect': 0, 'partial': 0, 'unmatched': 0, 'multi': 0} for tool in
296
+ options.tools.split(',')}
297
+ for dna_region, result in results.items():
298
+ for tool in options.tools.split(','):
299
+ # perfect
300
+ for key in result['pred_metrics'].get('perfect_Matches', {}):
301
+ if tool in key:
302
+ combined_tool_stats[tool]['perfect'] += 1
303
+ # partial
304
+ for key in result['pred_metrics'].get('partial_Hits', {}):
305
+ if tool in key:
306
+ combined_tool_stats[tool]['partial'] += 1
307
+ # unmatched
308
+ for key in result['pred_metrics'].get('unmatched_ORFs', {}):
309
+ if tool in key:
310
+ combined_tool_stats[tool]['unmatched'] += 1
311
+ # multi
312
+ for key in result['pred_metrics'].get('multi_Matched_ORFs', {}):
313
+ if tool in key:
314
+ combined_tool_stats[tool]['multi'] += 1
315
+ for tool, stats in combined_tool_stats.items():
316
+ out_file.write('\n'+
317
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}\n"
318
+ )
319
+
320
+ # Print combined metrics to stdout
321
+ print("\nCombined metrics for all contigs:")
322
+ print(f'Number of Genes: {total_genes}')
323
+ print(f'Number of ORFs: {total_orfs}')
324
+ print(
325
+ f'Perfect Matches: {total_perfect} [{total_genes}] - {format(100 * total_perfect / total_genes, ".2f")}%')
326
+ print(
327
+ f'Partial Matches: {total_partial} [{total_genes}] - {format(100 * total_partial / total_genes, ".2f")}%')
328
+ print(f'Missed Genes: {total_missed} [{total_genes}] - {format(100 * total_missed / total_genes, ".2f")}%')
329
+ print(
330
+ f'Unmatched ORFs: {total_unmatched} [{total_genes}] - {format(100 * total_unmatched / total_genes, ".2f")}%')
331
+ print(
332
+ f'Multi-matched ORFs: {total_multi} [{total_genes}] - {format(100 * total_multi / total_genes, ".2f")}%')
333
+
334
+ print('Tool breakdown (combined):')
335
+ for tool, stats in combined_tool_stats.items():
336
+ print('\n'+
337
+ f" {tool}: Perfect={stats['perfect']}, Partial={stats['partial']}, Unmatched={stats['unmatched']}, Multi-matched={stats['multi']}"
338
+ )
339
+
340
+
341
def main():
    """Parse the Aggregate-Compare command-line options and run the comparison.

    Prints the usage/citation banner, builds the argument parser (required,
    optional, output and misc groups), parses the command line and hands the
    resulting namespace to ``comparator``.
    """
    print("Thank you for using ORForise\nPlease report any issues to: https://github.com/NickJD/ORForise/issues\n"
          "Please Cite: https://doi.org/10.1093/bioinformatics/btab827\n"
          "#####")

    def parse_bool(value):
        """Convert a command-line token to a bool without evaluating it as code."""
        token = str(value).strip().lower()
        if token in ('true', '1'):
            return True
        if token in ('false', '0'):
            return False
        raise argparse.ArgumentTypeError(f"expected True or False, got {value!r}")

    parser = argparse.ArgumentParser(description='ORForise ' + ORForise_Version + ': Aggregate-Compare Run Parameters.')
    # Drop argparse's default "optional arguments" group so that only the
    # explicitly named groups below appear in the --help output.
    parser._action_groups.pop()

    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-dna', dest='genome_dna', required=True,
                          help='Genome DNA file (.fa) which both annotations are based on')
    required.add_argument('-t', dest='tools', required=True,
                          help='Which tools to analyse? (Prodigal,GeneMarkS)')
    required.add_argument('-tp', dest='tool_predictions', required=True,
                          help='Tool genome prediction file (.gff) - Provide '
                               'file locations for each tool comma separated')
    required.add_argument('-ref', dest='reference_annotation', required=True,
                          help='Which reference annotation file to use as reference?')

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
                          help='What features to consider as genes? - Default: CDS - '
                               'Provide comma separated list of features to consider as genes (e.g. CDS,exon)')
    optional.add_argument('-rt', dest='reference_tool', required=False,
                          help='What type of Annotation to compare to? -- Leave blank for Ensembl reference '
                               '- Provide tool name to compare output from two tools')

    output = parser.add_argument_group('Output')
    output.add_argument('-o', dest='outdir', required=False,
                        help='Define directory where detailed output should be placed - '
                             'If not provided, summary will be printed to std-out')
    output.add_argument('-n', dest='outname', required=False,
                        help='Define output file name - Mandatory if -o is provided: '
                             '<outname>_<contig_id>_ORF_Comparison.csv')

    misc = parser.add_argument_group('Misc')
    # A real bool default is used as-is by argparse; only user-supplied tokens
    # go through parse_bool, which replaces the previous type=eval.
    misc.add_argument('-v', dest='verbose', default=False, type=parse_bool, choices=[True, False],
                      help='Default - False: Print out runtime status')

    options = parser.parse_args()
    comparator(options)
378
+
379
# Script entry point: run the command-line interface only when this file is
# executed directly, then report that the run has finished.
if __name__ == "__main__":
    main()
    print("Complete")
382
+