resfinder-parser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resfinder_parser-0.1.0/LICENSE +21 -0
- resfinder_parser-0.1.0/PKG-INFO +158 -0
- resfinder_parser-0.1.0/README.md +136 -0
- resfinder_parser-0.1.0/pyproject.toml +22 -0
- resfinder_parser-0.1.0/src/resfinder_parser/__init__.py +14 -0
- resfinder_parser-0.1.0/src/resfinder_parser/__main__.py +26 -0
- resfinder_parser-0.1.0/src/resfinder_parser/data_classes.py +123 -0
- resfinder_parser-0.1.0/src/resfinder_parser/resfinder_result_parser.py +422 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 João Dourado Santos
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: resfinder-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Parser for ResFinder output data
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: João Dourado Santos
|
|
8
|
+
Author-email: joao.dourado@insa.min-saude.pt
|
|
9
|
+
Requires-Python: >=3.8,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Requires-Dist: pandas (>=2.0,<3.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# ResFinderParser
|
|
23
|
+
|
|
24
|
+
_Harmonizing ResFinder outputs for scalable AMR analysis_
|
|
25
|
+
|
|
26
|
+
**ResFinderParser** is a Python tool that parses and standardizes JSON outputs generated by ResFinder (CGE/DTU) across multiple isolates. It compiles per-sample resistance detection results into structured, analysis-ready tabular datasets.
|
|
27
|
+
|
|
28
|
+
ResFinder integrates multiple resistance detection layers, including acquired resistance genes (ResFinder database), chromosomal mutations (PointFinder), disinfectant-associated determinants (DisinFinder), and predicted phenotypic resistance. Although biologically comprehensive, these outputs are generated independently for each isolate and are not directly optimized for cross-sample analyses.
|
|
29
|
+
|
|
30
|
+
The parser restructures these results into harmonized matrices suitable for:
|
|
31
|
+
|
|
32
|
+
- Antimicrobial resistance (AMR) surveillance
|
|
33
|
+
- Comparative resistance profiling
|
|
34
|
+
- Resistance frequency and trend analysis
|
|
35
|
+
- Integration into automated genomic analysis pipelines
|
|
36
|
+
|
|
37
|
+
By transforming nested JSON outputs into standardized TSV files, the tool enables scalable genomic resistance analyses across research and surveillance contexts.
|
|
38
|
+
|
|
39
|
+
## Requirements
|
|
40
|
+
|
|
41
|
+
- Python 3.8+
|
|
42
|
+
- pandas 2.0+
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
### Using Poetry (recommended)
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
poetry install
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Using pip
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install resfinder-parser
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Input
|
|
59
|
+
|
|
60
|
+
**ResFinderParser** expects the JSON output files generated by ResFinder. The tool should be run on a directory containing one subfolder per isolate, each including the corresponding ResFinder `*.json` file (and, if available, PointFinder and DisinFinder results).
|
|
61
|
+
|
|
62
|
+
Example structure:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
RESFINDER_DIR/
|
|
66
|
+
├── ISOLATE_001/
|
|
67
|
+
│ └── ResFinder_results.json
|
|
68
|
+
├── ISOLATE_002/
|
|
69
|
+
│ └── ResFinder_results.json
|
|
70
|
+
└── ...
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The parser extracts resistance genes, chromosomal mutations, disinfectant-associated determinants, and predicted phenotypes directly from the JSON structure.
|
|
74
|
+
|
|
75
|
+
### Outputs
|
|
76
|
+
|
|
77
|
+
The parser generates four standardized TSV files:
|
|
78
|
+
|
|
79
|
+
1. **isolate_summaries.tsv**
|
|
80
|
+
One row per isolate containing:
|
|
81
|
+
- isolate_id
|
|
82
|
+
- analysis_date
|
|
83
|
+
- ResFinder version
|
|
84
|
+
- databases used
|
|
85
|
+
- provided species
|
|
86
|
+
- predicted phenotype summary
|
|
87
|
+
Useful for run tracking and dataset documentation.
|
|
88
|
+
|
|
89
|
+
2. **resfinder_results.tsv**
|
|
90
|
+
Long-format table (isolate × antibiotic) including:
|
|
91
|
+
- antibiotic
|
|
92
|
+
- resistance class
|
|
93
|
+
- amr_resistant (True/False)
|
|
94
|
+
- identity
|
|
95
|
+
- coverage
|
|
96
|
+
- grade
|
|
97
|
+
Suitable for resistance frequency analysis and comparative profiling.
|
|
98
|
+
|
|
99
|
+
3. **pointfinder_results.tsv**
|
|
100
|
+
Long-format mutation table including:
|
|
101
|
+
- gene
|
|
102
|
+
- mutation
|
|
103
|
+
- nucleotide change
|
|
104
|
+
- associated phenotype
|
|
105
|
+
- PMID
|
|
106
|
+
Enables mutation-level resistance analysis.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
4. **combined_presence_absence.tsv**
|
|
111
|
+
Wide-format matrix:
|
|
112
|
+
- One row per isolate
|
|
113
|
+
- One column per detected gene (ResFinder + DisinFinder)
|
|
114
|
+
- One column per relevant mutation (PointFinder)
|
|
115
|
+
Designed for comparative analyses, clustering, and integration with epidemiological metadata.
|
|
116
|
+
All outputs are directly compatible with R, Python (pandas), and automated genomic analysis pipelines.
|
|
117
|
+
|
|
118
|
+
## Usage
|
|
119
|
+
|
|
120
|
+
### Command Line
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
resfinder-parser -r /path/to/RESFINDER_DIR -o /path/to/output_dir
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### As a Python Module
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from resfinder_parser import ResfinderCollector
|
|
130
|
+
|
|
131
|
+
collector = ResfinderCollector("/path/to/RESFINDER_DIR", "/path/to/output_dir")
|
|
132
|
+
collector.collect()
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Command-line arguments:
|
|
136
|
+
|
|
137
|
+
- `-r`, `--resfinder_dir` (required): directory containing isolate subfolders
|
|
138
|
+
- `-o`, `--output_dir` (optional): directory where output files will be written; defaults to the ResFinder directory when omitted
|
|
139
|
+
|
|
140
|
+
### Development
|
|
141
|
+
|
|
142
|
+
Run tests:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
poetry run pytest
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Citation
|
|
149
|
+
|
|
150
|
+
If you use resfinder_parser in your work, please cite:
|
|
151
|
+
Santos JG et al. resfinder_parser. GitHub repository.
|
|
152
|
+
|
|
153
|
+
We also recommend citing ResFinder and its associated databases:
|
|
154
|
+
|
|
155
|
+
- Zankari et al., 2012. Journal of Antimicrobial Chemotherapy.
|
|
156
|
+
- Florensa et al., 2022. Microbial Genomics.
|
|
157
|
+
- CGE/DTU ResFinder platform.
|
|
158
|
+
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# ResFinderParser
|
|
2
|
+
|
|
3
|
+
_Harmonizing ResFinder outputs for scalable AMR analysis_
|
|
4
|
+
|
|
5
|
+
**ResFinderParser** is a Python tool that parses and standardizes JSON outputs generated by ResFinder (CGE/DTU) across multiple isolates. It compiles per-sample resistance detection results into structured, analysis-ready tabular datasets.
|
|
6
|
+
|
|
7
|
+
ResFinder integrates multiple resistance detection layers, including acquired resistance genes (ResFinder database), chromosomal mutations (PointFinder), disinfectant-associated determinants (DisinFinder), and predicted phenotypic resistance. Although biologically comprehensive, these outputs are generated independently for each isolate and are not directly optimized for cross-sample analyses.
|
|
8
|
+
|
|
9
|
+
The parser restructures these results into harmonized matrices suitable for:
|
|
10
|
+
|
|
11
|
+
- Antimicrobial resistance (AMR) surveillance
|
|
12
|
+
- Comparative resistance profiling
|
|
13
|
+
- Resistance frequency and trend analysis
|
|
14
|
+
- Integration into automated genomic analysis pipelines
|
|
15
|
+
|
|
16
|
+
By transforming nested JSON outputs into standardized TSV files, the tool enables scalable genomic resistance analyses across research and surveillance contexts.
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- Python 3.8+
|
|
21
|
+
- pandas 2.0+
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
### Using Poetry (recommended)
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
poetry install
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Using pip
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install resfinder-parser
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Input
|
|
38
|
+
|
|
39
|
+
**ResFinderParser** expects the JSON output files generated by ResFinder. The tool should be run on a directory containing one subfolder per isolate, each including the corresponding ResFinder `*.json` file (and, if available, PointFinder and DisinFinder results).
|
|
40
|
+
|
|
41
|
+
Example structure:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
RESFINDER_DIR/
|
|
45
|
+
├── ISOLATE_001/
|
|
46
|
+
│ └── ResFinder_results.json
|
|
47
|
+
├── ISOLATE_002/
|
|
48
|
+
│ └── ResFinder_results.json
|
|
49
|
+
└── ...
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The parser extracts resistance genes, chromosomal mutations, disinfectant-associated determinants, and predicted phenotypes directly from the JSON structure.
|
|
53
|
+
|
|
54
|
+
### Outputs
|
|
55
|
+
|
|
56
|
+
The parser generates four standardized TSV files:
|
|
57
|
+
|
|
58
|
+
1. **isolate_summaries.tsv**
|
|
59
|
+
One row per isolate containing:
|
|
60
|
+
- isolate_id
|
|
61
|
+
- analysis_date
|
|
62
|
+
- ResFinder version
|
|
63
|
+
- databases used
|
|
64
|
+
- provided species
|
|
65
|
+
- predicted phenotype summary
|
|
66
|
+
Useful for run tracking and dataset documentation.
|
|
67
|
+
|
|
68
|
+
2. **resfinder_results.tsv**
|
|
69
|
+
Long-format table (isolate × antibiotic) including:
|
|
70
|
+
- antibiotic
|
|
71
|
+
- resistance class
|
|
72
|
+
- amr_resistant (True/False)
|
|
73
|
+
- identity
|
|
74
|
+
- coverage
|
|
75
|
+
- grade
|
|
76
|
+
Suitable for resistance frequency analysis and comparative profiling.
|
|
77
|
+
|
|
78
|
+
3. **pointfinder_results.tsv**
|
|
79
|
+
Long-format mutation table including:
|
|
80
|
+
- gene
|
|
81
|
+
- mutation
|
|
82
|
+
- nucleotide change
|
|
83
|
+
- associated phenotype
|
|
84
|
+
- PMID
|
|
85
|
+
Enables mutation-level resistance analysis.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
4. **combined_presence_absence.tsv**
|
|
90
|
+
Wide-format matrix:
|
|
91
|
+
- One row per isolate
|
|
92
|
+
- One column per detected gene (ResFinder + DisinFinder)
|
|
93
|
+
- One column per relevant mutation (PointFinder)
|
|
94
|
+
Designed for comparative analyses, clustering, and integration with epidemiological metadata.
|
|
95
|
+
All outputs are directly compatible with R, Python (pandas), and automated genomic analysis pipelines.
|
|
96
|
+
|
|
97
|
+
## Usage
|
|
98
|
+
|
|
99
|
+
### Command Line
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
resfinder-parser -r /path/to/RESFINDER_DIR -o /path/to/output_dir
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### As a Python Module
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from resfinder_parser import ResfinderCollector
|
|
109
|
+
|
|
110
|
+
collector = ResfinderCollector("/path/to/RESFINDER_DIR", "/path/to/output_dir")
|
|
111
|
+
collector.collect()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Command-line arguments:
|
|
115
|
+
|
|
116
|
+
- `-r`, `--resfinder_dir` (required): directory containing isolate subfolders
|
|
117
|
+
- `-o`, `--output_dir` (optional): directory where output files will be written; defaults to the ResFinder directory when omitted
|
|
118
|
+
|
|
119
|
+
### Development
|
|
120
|
+
|
|
121
|
+
Run tests:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
poetry run pytest
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Citation
|
|
128
|
+
|
|
129
|
+
If you use resfinder_parser in your work, please cite:
|
|
130
|
+
Santos JG et al. resfinder_parser. GitHub repository.
|
|
131
|
+
|
|
132
|
+
We also recommend citing ResFinder and its associated databases:
|
|
133
|
+
|
|
134
|
+
- Zankari et al., 2012. Journal of Antimicrobial Chemotherapy.
|
|
135
|
+
- Florensa et al., 2022. Microbial Genomics.
|
|
136
|
+
- CGE/DTU ResFinder platform.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "resfinder-parser"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Parser for ResFinder output data"
|
|
5
|
+
authors = ["João Dourado Santos <joao.dourado@insa.min-saude.pt>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
packages = [{include = "resfinder_parser", from = "src"}]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = "^3.8"
|
|
12
|
+
pandas = "^2.0"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.scripts]
|
|
15
|
+
resfinder-parser = "resfinder_parser.__main__:main"
|
|
16
|
+
|
|
17
|
+
[tool.poetry.group.test.dependencies]
|
|
18
|
+
pytest = "^7.0"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["poetry-core"]
|
|
22
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
2
|
+
|
|
3
|
+
from .data_classes import Phenotype, IsolatePhenotypes, SeqRegion, IsolateSummary
|
|
4
|
+
from .resfinder_result_parser import ResfinderParser, ResfinderCollector
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"__version__",
|
|
8
|
+
"Phenotype",
|
|
9
|
+
"IsolatePhenotypes",
|
|
10
|
+
"SeqRegion",
|
|
11
|
+
"IsolateSummary",
|
|
12
|
+
"ResfinderParser",
|
|
13
|
+
"ResfinderCollector",
|
|
14
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from resfinder_parser import ResfinderCollector
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def main():
    """Command-line entry point: parse the options and run the collector.

    Reads ``-r/--resfinder_dir`` (mandatory) and ``-o/--output_dir``
    (optional) from the command line, builds a ``ResfinderCollector`` with
    them and calls its ``collect()`` method.
    """
    cli = argparse.ArgumentParser()

    # Directory that holds the ResFinder results to be collected.
    cli.add_argument(
        "-r",
        "--resfinder_dir",
        required=True,
        help="Path to the directory containing the resfinder results.",
    )

    # Destination directory; argparse leaves it as None when not supplied.
    cli.add_argument(
        "-o",
        "--output_dir",
        required=False,
        help="Path to the directory to write the results to.",
    )

    options = cli.parse_args()

    ResfinderCollector(options.resfinder_dir, options.output_dir).collect()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == "__main__":
|
|
26
|
+
main()
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class IsolateSummary:
    """Run-level metadata describing the ResFinder analysis of one isolate.

    All four fields are plain strings; an instance with empty strings is used
    elsewhere in the package to represent an isolate without results.
    """

    # Value of the top-level "key" entry of the ResFinder JSON.
    key: str
    # Species supplied to ResFinder for the run.
    provided_species: str
    # Free-text result summary reported by ResFinder.
    result_summary: str
    # Databases used, already flattened into a single string.
    databases: str
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SeqRegion:
    """A ResFinder sequence-region key split into its ``;;``-separated parts.

    The key is expected to look like ``gene;;version;;homolog``. Each part is
    stripped of surrounding whitespace and exposed as the ``gene``,
    ``version`` and ``homolog`` attributes.
    """

    def __init__(self, seq_region_output: str):
        """Split *seq_region_output* on ``;;`` into gene, version and homolog.

        A key with fewer than three parts leaves the missing attributes as
        empty strings instead of raising ``IndexError``; parts beyond the
        third are ignored.
        """
        parts = [part.strip() for part in seq_region_output.split(";;")]
        # Pad short keys so that one malformed entry cannot abort a whole run.
        parts += [""] * (3 - len(parts))

        self.gene, self.version, self.homolog = parts[:3]

    def __str__(self):
        """Return the normalised ``gene;;version;;homolog`` form of the key."""
        return f"{self.gene};;{self.version};;{self.homolog}"

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of regions print compactly."""
        return str(self)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Phenotype:
    """The per-antibiotic entry of a ResFinder ``phenotypes`` section.

    ``name`` holds the phenotype (antibiotic) name and ``seq_regions`` the
    parsed ``SeqRegion`` objects linked to it. The remaining entries of
    ``phenotype_fields`` become string attributes once ``feed_data`` has seen
    them; a field that is absent from the data is never set on the instance.
    """

    # Keys copied from the JSON entry; also the column order used when the
    # phenotypes are tabulated.
    phenotype_fields = [
        "amr_classes",
        "amr_resistant",
        "amr_species_relevant",
        "grade",
        "seq_regions",
    ]

    def __init__(self, name):
        """Create an empty phenotype called *name*."""
        self.name = name
        self.seq_regions: List[SeqRegion] = []

    def add_seq_region(self, seq_region: str):
        """Parse one sequence-region key and append it to ``seq_regions``."""
        self.seq_regions.append(SeqRegion(seq_region))

    def add_regions(self, regions: List[str]):
        """Parse and append every key in *regions*, preserving their order."""
        for region in regions:
            self.add_seq_region(region)

    def feed_data(self, data: dict):
        """Copy the known phenotype fields from *data* onto this object.

        * A ``seq_regions`` list is parsed into ``SeqRegion`` objects.
        * Any other list is joined with ``;``. Elements are converted with
          ``str`` first, so non-string items no longer raise ``TypeError``.
        * Every other non-``None`` value is stored as ``str(value)``.
        * Missing or ``None`` fields are skipped.
        """
        for field in self.phenotype_fields:
            found = data.get(field)
            if found is None:
                continue

            if isinstance(found, list):
                if field == "seq_regions":
                    self.add_regions(found)
                    continue
                found = ";".join(str(item) for item in found)

            setattr(self, field, str(found))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class IsolatePhenotypes:
    """All phenotypes recorded for one isolate, plus gene-level lookups.

    ``phenotypes`` maps a phenotype name to its ``Phenotype`` object and
    ``all_genes_affected`` maps a gene name to the phenotype names whose
    sequence regions mention that gene (filled by
    ``collect_all_genes_affected``).
    """

    def __init__(self, isolate_id: str, result_summary: str):
        """Start an empty collection for *isolate_id*."""
        self.isolate_id = isolate_id
        self.result_summary = result_summary
        self.phenotypes: Dict[str, Phenotype] = {}
        self.all_genes_affected: Dict[str, list] = {}

    def add_phenotype(self, phenotype: "Phenotype"):
        """Register *phenotype* under its name, replacing any previous entry."""
        self.phenotypes[phenotype.name] = phenotype

    def phenotype_dataframe(self):
        """Return one row per phenotype with an ``antibiotic`` column first.

        The remaining columns are the phenotype's ``phenotype_fields``, with
        every cell a string:

        * list values are joined with ``;``;
        * an empty list becomes ``"0"``;
        * a field that was never set becomes ``""``.

        The empty placeholder keeps each value under its own column.
        Previously a missing field was skipped, which shifted the later
        values one column to the left.
        """
        rows = []
        columns = None

        for phenotype_name, phenotype in self.phenotypes.items():
            columns = ["antibiotic"] + list(phenotype.phenotype_fields)
            row = [phenotype_name]

            for field in phenotype.phenotype_fields:
                value = getattr(phenotype, field, None)
                if value is None:
                    # Placeholder keeps the row aligned with the header.
                    row.append("")
                    continue
                if isinstance(value, list):
                    value = ";".join(str(item) for item in value) if value else "0"
                row.append(str(value))

            rows.append(row)

        if columns is None:
            # No phenotypes at all: still return the standard header.
            columns = ["antibiotic"] + Phenotype.phenotype_fields

        return pd.DataFrame(rows, columns=columns)

    def collect_all_genes_affected(self):
        """Fill ``all_genes_affected`` from the sequence regions of every phenotype.

        Phenotype names are kept in first-seen order and never duplicated, so
        calling this method repeatedly is harmless.
        """
        for phenotype_name, phenotype in self.phenotypes.items():
            for seq_region in phenotype.seq_regions:
                linked = self.all_genes_affected.setdefault(seq_region.gene, [])
                if phenotype_name not in linked:
                    linked.append(phenotype_name)

    def all_genes(self):
        """Return the gene of every sequence region, duplicates included."""
        return [
            seq_region.gene
            for phenotype in self.phenotypes.values()
            for seq_region in phenotype.seq_regions
        ]

    def gene_affected(self, gene: str):
        """Return the phenotypes linked to *gene* joined with ``"; "``.

        Returns ``""`` when the gene is unknown. Only meaningful after
        ``collect_all_genes_affected`` has been called.
        """
        return "; ".join(self.all_genes_affected.get(gene, []))
|
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import datetime
|
|
6
|
+
import logging
|
|
7
|
+
from typing import List
|
|
8
|
+
from .data_classes import IsolatePhenotypes, Phenotype, IsolateSummary
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ResfinderParser:
    """Parser for the ResFinder output of a single isolate directory.

    On construction the parser locates the isolate's ResFinder JSON file and
    its optional PointFinder table, reads the run summary into ``passport``
    and prepares an empty ``IsolatePhenotypes`` container in ``phenotypes``.
    ``has_data`` and ``has_pointfinder_data`` tell callers which of the
    ``collect_*`` methods can be used.
    """

    # Tab-separated PointFinder table expected next to the JSON file.
    pointfinder_results_filename: str = "PointFinder_results.txt"
    # Not referenced by any method of this class.
    resfinder_results_filename = "ResFinder_results_tab.txt"
    # Files ending with this suffix are treated as ResFinder JSON output.
    resfinder_results_suffix = ".json"
    # Lower-case database names to leave out of summaries and sequence regions.
    databases_to_exclude = []

    def __init__(self, RESFINDER_dir, isolate_dir):
        """Locate and summarise the results of *isolate_dir* inside *RESFINDER_dir*.

        If the isolate directory contains a ``resfinder_results`` subfolder,
        the files are looked up there instead. When several JSON files are
        present, the alphabetically first one is used so the choice does not
        depend on directory listing order.
        """
        self.isolate_id = isolate_dir

        if os.path.isdir(os.path.join(RESFINDER_dir, isolate_dir, "resfinder_results")):
            isolate_dir = os.path.join(isolate_dir, "resfinder_results")

        results_dir = os.path.join(RESFINDER_dir, isolate_dir)

        self.pointfinder_results_filepath = os.path.join(
            results_dir, self.pointfinder_results_filename
        )

        # os.listdir() order is arbitrary; sort to make json_files[0] stable.
        json_files = sorted(
            f for f in os.listdir(results_dir) if f.endswith(self.resfinder_results_suffix)
        )

        if not json_files:
            logging.error(
                f"No resfinder json files found for isolate {self.isolate_id}."
            )
        elif len(json_files) > 1:
            logging.warning(
                f"Multiple resfinder json files found for isolate {self.isolate_id}. Using {json_files[0]}"
            )

        # Timestamp of the parsing run, reported as "analysis_date".
        self.time_now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        self.has_data = bool(json_files)

        if self.has_data:
            self.resfinder_results_filepath = os.path.join(results_dir, json_files[0])
            self.passport = self.collect_summary()
        else:
            self.passport = self.empty_passport
            self.resfinder_results_filepath = ""

        # A zero-byte PointFinder file counts as "no data".
        self.has_pointfinder_data = (
            os.path.isfile(self.pointfinder_results_filepath)
            and os.path.getsize(self.pointfinder_results_filepath) > 0
        )

        self.phenotypes = IsolatePhenotypes(
            self.isolate_id, self.passport.result_summary
        )

    def json_parse_antibiotics(self):
        """Load every entry of the JSON ``phenotypes`` section into ``self.phenotypes``."""
        content = self.read_json()

        for phenotype_str, data in content["phenotypes"].items():
            antibiotic = Phenotype(phenotype_str)
            antibiotic.feed_data(data)
            self.phenotypes.add_phenotype(antibiotic)

    def resfinder_json_summary(self, content) -> "IsolateSummary":
        """Build an ``IsolateSummary`` from the parsed JSON *content*.

        The databases are flattened into a ``"name-version; name-version"``
        string, skipping any database whose lower-cased name is listed in
        ``databases_to_exclude``.
        """
        analysis_key = content["key"]
        provided_species = content["provided_species"]
        result_summary = content["result_summary"]
        databases = "; ".join(
            f"{db['database_name']}-{db['database_version']}"
            for db in content["databases"].values()
            if db["database_name"].lower() not in self.databases_to_exclude
        )
        return IsolateSummary(analysis_key, provided_species, result_summary, databases)

    @property
    def empty_passport(self):
        """Placeholder summary used when the isolate has no JSON file."""
        return IsolateSummary("", "", "No results found.", "")

    def read_json(self):
        """Return the parsed content of the isolate's ResFinder JSON file.

        The file is decoded as UTF-8, the standard JSON interchange encoding,
        rather than the platform default.
        """
        with open(self.resfinder_results_filepath, "r", encoding="utf-8") as f:
            return json.load(f)

    def collect_summary(self) -> "IsolateSummary":
        """Read the JSON file and return its run summary."""
        return self.resfinder_json_summary(self.read_json())

    def add_analyis_columns(self, df: pd.DataFrame):
        """Add ``isolate_id``, ``analysis_date`` and ``version`` as leading columns.

        The three columns are written onto *df* itself, as before, and a
        reordered frame is returned. The order is built from the column names
        rather than from "the last three columns", so a frame that already
        carries these columns keeps them in front instead of being scrambled.
        """
        df["isolate_id"] = self.isolate_id
        df["analysis_date"] = self.time_now
        df["version"] = self.passport.key

        leading = ["isolate_id", "analysis_date", "version"]
        trailing = [col for col in df.columns if col not in leading]

        return df[leading + trailing]

    # Correctly spelled alias; the original name is kept for existing callers.
    add_analysis_columns = add_analyis_columns

    def collect_pointfinder_results(self):
        """Read the tab-separated PointFinder table and tag it with the analysis columns."""
        results = pd.read_csv(self.pointfinder_results_filepath, sep="\t")
        return self.add_analyis_columns(results)

    def collect_phenotype_results(self):
        """Parse the JSON phenotypes and return them as a tagged long-format table."""
        self.json_parse_antibiotics()
        isolate_results = self.phenotypes.phenotype_dataframe()
        return self.add_analyis_columns(isolate_results)

    def seq_regions_parse(self):
        """Flatten the JSON ``seq_regions`` section into one row per (region, phenotype).

        Returns a frame with the columns ``seq_region``, ``antibiotic``,
        ``query_id`` and ``identity``. A region is dropped only when it names
        at least one reference database and all of them are listed in
        ``databases_to_exclude``. Regions with an empty ``ref_database`` list
        are kept; ``all()`` over an empty list used to discard them.
        """
        results = self.read_json()

        rows = []
        for region_key, region in results["seq_regions"].items():
            # "ResFinder-2.1.0" -> "resfinder"
            source_dbs = [db.split("-")[0].lower() for db in region["ref_database"]]
            if source_dbs and all(db in self.databases_to_exclude for db in source_dbs):
                continue

            for phenotype in region["phenotypes"]:
                rows.append(
                    [region_key, phenotype, region["query_id"], region["identity"]]
                )

        return pd.DataFrame(
            rows, columns=["seq_region", "antibiotic", "query_id", "identity"]
        )

    def extend_resfinder_results(self, resfinder_df):
        """Add ``contigs`` and ``identity`` columns to the phenotype table.

        For every antibiotic, ``contigs`` lists the ``query_id`` of each
        matching sequence region joined with ``"; "``, and ``identity`` is the
        identity of the first matching region. Antibiotics without a match
        get ``""`` and ``0``. An empty input table simply gains the two empty
        columns instead of raising.
        """
        seq_reg_df = self.seq_regions_parse()

        if len(resfinder_df) == 0:
            # Series.apply() on an empty column yields nothing to unpack into
            # two columns, so create them explicitly.
            resfinder_df["contigs"] = pd.Series(dtype=object)
            resfinder_df["identity"] = pd.Series(dtype=object)
            return resfinder_df

        def contig_id_info(antibiotic: str):
            hits = seq_reg_df[seq_reg_df.antibiotic == antibiotic]

            if hits.shape[0] == 0:
                return pd.Series(["", 0])

            return pd.Series(["; ".join(hits.query_id.values), hits.identity.values[0]])

        resfinder_df[["contigs", "identity"]] = resfinder_df.antibiotic.apply(
            contig_id_info
        )

        return resfinder_df

    def isolate_summary(self):
        """Return a one-row overview of the isolate's run, tagged with the analysis columns."""
        summary_df = pd.DataFrame(
            [
                [
                    self.passport.databases,
                    self.passport.provided_species,
                    self.passport.result_summary,
                ]
            ],
            columns=["databases", "provided_species", "result_summary"],
        )

        return self.add_analyis_columns(summary_df)

    @property
    def empty_isolate_summary(self):
        """One-row overview, in the ``isolate_summary`` layout, for an isolate without results."""
        return pd.DataFrame(
            [
                [
                    self.isolate_id,
                    self.time_now,
                    self.passport.key,
                    self.passport.databases,
                    "",
                    "No results found.",
                ]
            ],
            columns=[
                "isolate_id",
                "analysis_date",
                "version",
                "databases",
                "provided_species",
                "result_summary",
            ],
        )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class ResfinderCollector:
    """Collect the ResFinder results of every isolate directory into TSV tables.

    ``RESFINDER_dir`` is scanned for isolate subdirectories when the object
    is created. ``collect()`` parses each of them with ``ResfinderParser``
    and writes the combined tables to ``output_dir``.
    """

    def __init__(self, RESFINDER_dir: str, output_dir: Optional[str] = None):
        """Remember the directories, list the isolates and set up logging.

        The module logger is set to ``INFO`` and receives a console handler
        only if it has none yet, so creating several collectors does not
        print every message several times.
        """
        self.RESFINDER_dir = RESFINDER_dir
        self.output_dir = output_dir
        self.isolate_dirs = self.resfinder_dirs()
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler())
        self.logger.info(
            f"Found {len(self.isolate_dirs)} isolate directories in {self.RESFINDER_dir}"
        )

    def _is_isolate_dir(self, dirpath: str):
        """Return True when *dirpath* is a non-hidden directory inside ``RESFINDER_dir``."""
        if dirpath.startswith("."):
            return False

        return os.path.isdir(os.path.join(self.RESFINDER_dir, dirpath))

    def resfinder_dirs(self):
        """Return the isolate directory names, sorted for a reproducible row order."""
        return sorted(
            entry for entry in os.listdir(self.RESFINDER_dir) if self._is_isolate_dir(entry)
        )

    def genes_affected(self, isolate_phenotypes: List["IsolatePhenotypes"]):
        """Build the isolate-by-gene table.

        The frame has ``isolate_id`` and ``result_summary`` followed by one
        column per gene seen in any isolate. Each cell holds the phenotypes
        linked to that gene in that isolate, or ``""`` when there are none.
        Gene columns are sorted by name so the layout is identical between
        runs; a plain ``set`` iterates in a hash-dependent order.
        """
        all_genes = sorted(
            {gene for isolate in isolate_phenotypes for gene in isolate.all_genes()}
        )

        rows = []
        for isolate in isolate_phenotypes:
            isolate.collect_all_genes_affected()
            row = [isolate.isolate_id, isolate.result_summary]
            row.extend(isolate.gene_affected(gene) for gene in all_genes)
            rows.append(row)

        return pd.DataFrame(rows, columns=["isolate_id", "result_summary"] + all_genes)

    def pointfinder_summary(
        self, pointfinder_results: pd.DataFrame
    ) -> Optional[pd.DataFrame]:
        """Pivot the long PointFinder table into an isolate-by-mutation matrix.

        Rows whose ``Resistance`` is ``"Unknown"`` are ignored. When one
        mutation of an isolate appears with several resistances, they are
        joined with ``"; "``. Mutations absent from an isolate are reported
        as ``""``.

        Returns ``None`` when *pointfinder_results* is empty.
        """
        if pointfinder_results.empty:
            return None

        isolate_ids = pointfinder_results["isolate_id"].unique()

        known = pointfinder_results[pointfinder_results["Resistance"] != "Unknown"]

        matrices = []
        for isolate_id in isolate_ids:
            group = known[known["isolate_id"] == isolate_id]

            # Merge rows with the same mutation but different resistance.
            # Missing values are dropped so that str.join() cannot fail on NaN.
            group = (
                group.groupby(["isolate_id", "Mutation"])
                .agg({"Resistance": lambda x: "; ".join(x.dropna().astype(str))})
                .reset_index()
            )

            matrix = group.pivot(
                index="isolate_id", columns="Mutation", values="Resistance"
            )
            matrices.append(matrix.reset_index())

        # Fill after concatenating: stacking frames with different mutation
        # columns is what introduces the missing cells.
        return pd.concat(matrices, axis=0, ignore_index=True).fillna("")

    def collect_all_results(self):
        """Parse every isolate and return the four result tables.

        Returns a tuple ``(pointfinder_results, resfinder_results,
        isolate_summaries, combined_presence_absence)``, or four ``None``
        values when no isolate produced phenotype results.
        ``pointfinder_results`` is an empty frame when no isolate has
        PointFinder data.
        """
        pointfinder_results = []
        resfinder_results = []
        isolate_summaries = []
        isolate_phenotypes = []

        for isolate_dir in self.isolate_dirs:
            # Re-check in case isolate_dirs was changed after construction.
            if not self._is_isolate_dir(isolate_dir):
                continue

            isolate_parser = ResfinderParser(self.RESFINDER_dir, isolate_dir)

            if not isolate_parser.has_data:
                self.logger.info(
                    f"No results found for isolate {isolate_parser.isolate_id}"
                )
                isolate_summaries.append(isolate_parser.empty_isolate_summary)
                continue

            isolate_phenotype_results = isolate_parser.extend_resfinder_results(
                isolate_parser.collect_phenotype_results()
            )
            isolate_summary = isolate_parser.isolate_summary()

            if isolate_parser.has_pointfinder_data:
                pointfinder_results.append(isolate_parser.collect_pointfinder_results())

            resfinder_results.append(isolate_phenotype_results)
            isolate_summaries.append(isolate_summary)
            isolate_phenotypes.append(isolate_parser.phenotypes)

        if not isolate_phenotypes:
            return None, None, None, None

        resfinder_results = pd.concat(resfinder_results, axis=0)
        isolate_summaries = pd.concat(isolate_summaries, axis=0)

        genes_affected = self.genes_affected(isolate_phenotypes)
        combined_presence_absence = genes_affected.copy()

        if pointfinder_results:
            pointfinder_results = pd.concat(pointfinder_results, axis=0)

            pointfinder_results_summary = self.pointfinder_summary(pointfinder_results)

            # The summary is None when the PointFinder files held no rows;
            # merging with None would raise, so keep the gene table as is.
            if pointfinder_results_summary is not None:
                combined_presence_absence = pd.merge(
                    genes_affected,
                    pointfinder_results_summary,
                    on="isolate_id",
                    how="outer",
                )
        else:
            pointfinder_results = pd.DataFrame()

        return (
            pointfinder_results,
            resfinder_results,
            isolate_summaries,
            combined_presence_absence,
        )

    def collect(self):
        """Collect all results and write them as TSV files into ``output_dir``.

        ``output_dir`` defaults to ``RESFINDER_dir``. A missing output
        directory is created together with any missing parent directories.
        Nothing is written when no isolate produced results.
        """
        self.logger.info(f"Collecting results from {len(self.isolate_dirs)} isolates..")

        if self.output_dir is None:
            self.output_dir = self.RESFINDER_dir
        else:
            os.makedirs(self.output_dir, exist_ok=True)

        (
            pointfinder_results,
            resfinder_results,
            isolate_summaries,
            combined_presence_absence,
        ) = self.collect_all_results()

        if pointfinder_results is None:
            self.logger.info("No results found.")
            return

        outputs = (
            ("pointfinder_results.tsv", pointfinder_results),
            ("resfinder_results.tsv", resfinder_results),
            ("isolate_summaries.tsv", isolate_summaries),
            ("combined_presence_absence.tsv", combined_presence_absence),
        )
        for filename, table in outputs:
            table.to_csv(
                os.path.join(self.output_dir, filename),
                sep="\t",
                index=False,
            )

        self.logger.info(f"Results written to {self.output_dir}")
|