ssi-analysis-result-parsers 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {ssi_analysis_result_parsers-0.0.1/ssi_analysis_result_parsers.egg-info → ssi_analysis_result_parsers-0.0.3}/PKG-INFO +2 -2
  2. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/settings.ini +4 -3
  3. ssi_analysis_result_parsers-0.0.3/ssi_analysis_result_parsers/Legionella_parser.py +219 -0
  4. ssi_analysis_result_parsers-0.0.3/ssi_analysis_result_parsers/__init__.py +1 -0
  5. ssi_analysis_result_parsers-0.0.3/ssi_analysis_result_parsers/_modidx.py +78 -0
  6. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/blast_parser.py +26 -24
  7. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/core.py +83 -33
  8. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3/ssi_analysis_result_parsers.egg-info}/PKG-INFO +2 -2
  9. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/SOURCES.txt +7 -1
  10. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/entry_points.txt +2 -1
  11. ssi_analysis_result_parsers-0.0.3/test_input/Legionella/batch_parser_file_paths.tsv +3 -0
  12. ssi_analysis_result_parsers-0.0.3/test_input/Legionella/lag-1_blast_2.tsv +0 -0
  13. ssi_analysis_result_parsers-0.0.3/test_input/Legionella/test2.sbt.tsv +2 -0
  14. ssi_analysis_result_parsers-0.0.3/test_input/blast_parser/empty_gene_presence_absense_test.tsv +0 -0
  15. ssi_analysis_result_parsers-0.0.3/test_output/test.tsv +3 -0
  16. ssi_analysis_result_parsers-0.0.3/test_output/test_batch_output.tsv +3 -0
  17. ssi_analysis_result_parsers-0.0.1/ssi_analysis_result_parsers/Legionella_parser.py +0 -88
  18. ssi_analysis_result_parsers-0.0.1/ssi_analysis_result_parsers/__init__.py +0 -1
  19. ssi_analysis_result_parsers-0.0.1/ssi_analysis_result_parsers/_modidx.py +0 -38
  20. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/LICENSE +0 -0
  21. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/MANIFEST.in +0 -0
  22. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/README.md +0 -0
  23. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/pyproject.toml +0 -0
  24. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/setup.cfg +0 -0
  25. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/setup.py +0 -0
  26. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/config/config.default.env +0 -0
  27. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/config/config.default.yaml +0 -0
  28. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/hello_world.py +0 -0
  29. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers/some_string.py +0 -0
  30. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/dependency_links.txt +0 -0
  31. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/not-zip-safe +0 -0
  32. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/requires.txt +0 -0
  33. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/ssi_analysis_result_parsers.egg-info/top_level.txt +0 -0
  34. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_input/.DS_Store +0 -0
  35. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_input/Legionella/lag-1_blast.tsv +0 -0
  36. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_input/Legionella/test.sbt.tsv +0 -0
  37. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_input/blast_parser/allele_matches_test.tsv +0 -0
  38. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_input/blast_parser/gene_presence_absence_test.tsv +0 -0
  39. {ssi_analysis_result_parsers-0.0.1 → ssi_analysis_result_parsers-0.0.3}/test_output/output_with_sample_name.tsv +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ssi_analysis_result_parsers
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: TODO
5
5
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
6
- Author: thej-ssi
6
+ Author: Thor Bech Johannesen
7
7
  Author-email: thej@ssi.dk
8
8
  License: MIT License
9
9
  Keywords: nbdev jupyter notebook python
@@ -5,7 +5,7 @@
5
5
  ### Python library ###
6
6
  repo = ssi_analysis_result_parsers
7
7
  lib_name = %(repo)s
8
- version = 0.0.1
8
+ version = 0.0.3
9
9
  min_python = 3.9
10
10
  license = MIT
11
11
  black_formatting = True
@@ -29,7 +29,7 @@ title = %(lib_name)s
29
29
 
30
30
  ### PyPI ###
31
31
  audience = Developers
32
- author = thej-ssi
32
+ author = Thor Bech Johannesen
33
33
  author_email = thej@ssi.dk
34
34
  copyright = 2025 onwards, %(author)s
35
35
  description = TODO
@@ -49,4 +49,5 @@ pip_requirements = python_dotenv envyaml pandas black
49
49
  console_scripts =
50
50
  blast_parser_presence_absence=ssi_analysis_result_parsers.blast_parser:presence_absence
51
51
  blast_parser_allele_matches=ssi_analysis_result_parsers.blast_parser:allele_matches
52
- legionella_parser=ssi_analysis_result_parsers.Legionella_parser:legionella_parser
52
+ get_leg_results=ssi_analysis_result_parsers.Legionella_parser:legionella_parser
53
+ get_leg_results_batch=ssi_analysis_result_parsers.Legionella_parser:legionella_batch_parser
@@ -0,0 +1,219 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_batch_from_sheet', 'LegionellaResults',
5
+ 'legionella_batch_from_dict', 'legionella_parser', 'legionella_batch_parser']
6
+
7
+ # %% ../nbs/39_Legionella_parser.ipynb 3
8
+ # standard libs
9
+ import os
10
+ import re
11
+
12
+ # Common to template
13
+ # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
14
+ import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
15
+ import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
16
+ import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
17
+ from fastcore import (
18
+ test,
19
+ )
20
+ from fastcore.script import (
21
+ call_parse,
22
+ ) # for @call_parse, https://fastcore.fast.ai/script
23
+ import json # for nicely printing json and yaml
24
+
25
+ # import functions from core module (optional, but most likely needed).
26
+ from ssi_analysis_result_parsers import (
27
+ core,
28
+ )
29
+ from .blast_parser import extract_presence_absence
30
+
31
+ # Project specific libraries
32
+ from pathlib import Path
33
+ import pandas
34
+ import sys
35
+
36
+ # %% ../nbs/39_Legionella_parser.ipynb 6
37
+ def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
38
+ """
39
+ Returns dictionary of results found in the Legionella SBT summary output
40
+ """
41
+ if os.path.exists(legionella_sbt_results_tsv):
42
+ df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
43
+ df.set_index("sample", inplace=True, drop=True)
44
+ d = df.to_dict(orient="index")
45
+ fname = next(iter(d))
46
+ return d[fname]
47
+ else:
48
+ print(
49
+ f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
50
+ file=sys.stderr,
51
+ )
52
+ return None
53
+
54
+
55
+ def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
56
+ sbt_results_dict = extract_legionella_sbt(
57
+ legionella_sbt_results_tsv=legionella_sbt_results_tsv
58
+ )
59
+ lag1_blast_dict = extract_presence_absence(
60
+ blast_output_tsv=lag1_blast_tsv,
61
+ hits_as_string=False,
62
+ include_match_stats=False,
63
+ gene_names=["lag-1"],
64
+ )
65
+ results_dict = core.update_results_dict(
66
+ sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
67
+ )
68
+ return results_dict
69
+
70
+
71
+ def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
72
+ results_dict = {}
73
+ for sample_name, path_dict in file_paths.items():
74
+ legionella_results = legionella_summary(
75
+ legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
76
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
77
+ )
78
+ results_dict[sample_name] = legionella_results
79
+ if output_file is not None:
80
+ df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
81
+ names="sample_name"
82
+ )
83
+ df.to_csv(output_file, sep="\t", index=False)
84
+ return results_dict
85
+
86
+
87
+ class LegionellaResults(core.PipelineResults):
88
+
89
+ @classmethod
90
+ def from_tool_paths(
91
+ cls, legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path, sample_name=None
92
+ ):
93
+ legionella_results = cls.legionella_summary(
94
+ legionella_sbt_results_tsv=legionella_sbt_results_tsv,
95
+ lag1_blast_tsv=lag1_blast_tsv,
96
+ )
97
+ return cls({sample_name: legionella_results})
98
+
99
+ @classmethod
100
+ def from_tool_paths_dict(cls, file_paths: dict):
101
+ results_dict = {}
102
+ for sample_name, path_dict in file_paths.items():
103
+ legionella_results = cls.legionella_summary(
104
+ legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
105
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
106
+ )
107
+ results_dict[sample_name] = legionella_results
108
+ return cls(results_dict)
109
+
110
+ @classmethod
111
+ def from_tool_paths_dataframe(cls, file_paths_df: pandas.DataFrame):
112
+ file_paths = file_paths_df.to_dict(orient="index")
113
+ results_dict = {}
114
+ for sample_name, path_dict in file_paths.items():
115
+ legionella_results = cls.legionella_summary(
116
+ legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
117
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
118
+ )
119
+ print(legionella_results)
120
+ results_dict[sample_name] = legionella_results
121
+ return cls(results_dict)
122
+
123
+ @classmethod
124
+ def from_tool_paths_tsv(cls, tool_paths_tsv: Path):
125
+ file_paths_df = pandas.read_csv(tool_paths_tsv, sep="\t")
126
+ file_paths_df.set_index("sample_name", inplace=True, drop=True)
127
+ # return_cls =
128
+ # results_dict = file_paths_df.to_dict(orient="index")
129
+ return cls.from_tool_paths_dataframe(file_paths_df)
130
+ """for sample_name, path_dict in file_paths.items():
131
+ legionella_results = cls.legionella_summary(legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
132
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]))
133
+ results_dict[sample_name] = legionella_results
134
+ return cls(results_dict)"""
135
+
136
+ def legionella_summary(
137
+ legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path
138
+ ) -> dict:
139
+ sbt_results_dict = extract_legionella_sbt(
140
+ legionella_sbt_results_tsv=legionella_sbt_results_tsv
141
+ )
142
+ lag1_blast_dict = extract_presence_absence(
143
+ blast_output_tsv=lag1_blast_tsv,
144
+ hits_as_string=False,
145
+ include_match_stats=False,
146
+ gene_names=["lag-1"],
147
+ )
148
+ print(lag1_blast_dict)
149
+ results_dict = core.update_results_dict(
150
+ sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
151
+ )
152
+ return results_dict
153
+
154
+ def __repr__(self):
155
+ return f"< Legionella analysis results object. {len(self.results_df)} samples with {len(self.results_df.columns)} result variables > "
156
+
157
+
158
+ def legionella_batch_from_dict(file_paths: dict, output_file: Path = None):
159
+ results_dict = {}
160
+ for sample_name, path_dict in file_paths.items():
161
+ legionella_results = legionella_summary(
162
+ legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
163
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
164
+ )
165
+ results_dict[sample_name] = legionella_results
166
+ if output_file is not None:
167
+ df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
168
+ names="sample_name"
169
+ )
170
+ df.to_csv(output_file, sep="\t", index=False)
171
+ return results_dict
172
+
173
+
174
+ def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
175
+ results_dict = {}
176
+ for sample_name, path_dict in file_paths.items():
177
+ legionella_results = legionella_summary(
178
+ legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
179
+ lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
180
+ )
181
+ results_dict[sample_name] = legionella_results
182
+ if output_file is not None:
183
+ df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
184
+ names="sample_name"
185
+ )
186
+ df.to_csv(output_file, sep="\t", index=False)
187
+ return results_dict
188
+
189
+ # %% ../nbs/39_Legionella_parser.ipynb 9
190
+ @call_parse
191
+ def legionella_parser(
192
+ legionella_sbt_file: Path = None, # Path "*.sbt.tsv from legionella_sbt program"
193
+ lag_1_blast_output: Path = None, # Path to output from lag1_blast. Generated with blastn -query lag-1.fasta -subject assembly.fasta -outfmt "6 qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore"
194
+ output_file: Path = None, # Path to output tsv
195
+ sample_name: str = None,
196
+ config_file: str = None, # config file to set env vars from
197
+ ) -> None:
198
+ """ """
199
+ # config = core.get_config(config_file) # Set env vars and get config variables
200
+ legionella_results = LegionellaResults.from_tool_paths(
201
+ legionella_sbt_results_tsv=legionella_sbt_file,
202
+ lag1_blast_tsv=lag_1_blast_output,
203
+ sample_name=sample_name,
204
+ )
205
+ legionella_results.write_tsv(output_file=output_file)
206
+
207
+
208
+ @call_parse
209
+ def legionella_batch_parser(
210
+ file_path_tsv: Path = None, # Path to tsv containing file paths to the outputs from tools to be parsed. Must contain headers "sample_name", "sbt_results", and "lag1_blast_results"
211
+ output_file: Path = None, # Path to output tsv
212
+ config_file: str = None, # config file to set env vars from
213
+ ) -> None:
214
+ """ """
215
+ # config = core.get_config(config_file) # Set env vars and get config variables
216
+ legionella_results = LegionellaResults.from_tool_paths_tsv(
217
+ tool_paths_tsv=file_path_tsv
218
+ )
219
+ legionella_results.write_tsv(output_file)
@@ -0,0 +1 @@
1
+ __version__ = "0.0.2"
@@ -0,0 +1,78 @@
1
+ # Autogenerated by nbdev
2
+
3
+ d = { 'settings': { 'branch': 'main',
4
+ 'doc_baseurl': '/ssi_analysis_result_parsers',
5
+ 'doc_host': 'https://thej-ssi.github.io',
6
+ 'git_url': 'https://github.com/thej-ssi/ssi_analysis_result_parsers',
7
+ 'lib_path': 'ssi_analysis_result_parsers'},
8
+ 'syms': { 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults': ( 'legionella_parser.html#legionellaresults',
9
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
10
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.__repr__': ( 'legionella_parser.html#legionellaresults.__repr__',
11
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
12
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.from_tool_paths': ( 'legionella_parser.html#legionellaresults.from_tool_paths',
13
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
14
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.from_tool_paths_dataframe': ( 'legionella_parser.html#legionellaresults.from_tool_paths_dataframe',
15
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
16
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.from_tool_paths_dict': ( 'legionella_parser.html#legionellaresults.from_tool_paths_dict',
17
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
18
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.from_tool_paths_tsv': ( 'legionella_parser.html#legionellaresults.from_tool_paths_tsv',
19
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
20
+ 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.legionella_summary': ( 'legionella_parser.html#legionellaresults.legionella_summary',
21
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
22
+ 'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': ( 'legionella_parser.html#extract_legionella_sbt',
23
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
24
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_dict': ( 'legionella_parser.html#legionella_batch_from_dict',
25
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
26
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_sheet': ( 'legionella_parser.html#legionella_batch_from_sheet',
27
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
28
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_parser': ( 'legionella_parser.html#legionella_batch_parser',
29
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
30
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': ( 'legionella_parser.html#legionella_parser',
31
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
32
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': ( 'legionella_parser.html#legionella_summary',
33
+ 'ssi_analysis_result_parsers/Legionella_parser.py')},
34
+ 'ssi_analysis_result_parsers.blast_parser': { 'ssi_analysis_result_parsers.blast_parser.allele_matches': ( 'blast_parser.html#allele_matches',
35
+ 'ssi_analysis_result_parsers/blast_parser.py'),
36
+ 'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': ( 'blast_parser.html#extract_allele_matches',
37
+ 'ssi_analysis_result_parsers/blast_parser.py'),
38
+ 'ssi_analysis_result_parsers.blast_parser.extract_presence_absence': ( 'blast_parser.html#extract_presence_absence',
39
+ 'ssi_analysis_result_parsers/blast_parser.py'),
40
+ 'ssi_analysis_result_parsers.blast_parser.presence_absence': ( 'blast_parser.html#presence_absence',
41
+ 'ssi_analysis_result_parsers/blast_parser.py')},
42
+ 'ssi_analysis_result_parsers.core': { 'ssi_analysis_result_parsers.core.PipelineResults': ( 'core.html#pipelineresults',
43
+ 'ssi_analysis_result_parsers/core.py'),
44
+ 'ssi_analysis_result_parsers.core.PipelineResults.__init__': ( 'core.html#pipelineresults.__init__',
45
+ 'ssi_analysis_result_parsers/core.py'),
46
+ 'ssi_analysis_result_parsers.core.PipelineResults.__iter__': ( 'core.html#pipelineresults.__iter__',
47
+ 'ssi_analysis_result_parsers/core.py'),
48
+ 'ssi_analysis_result_parsers.core.PipelineResults.__len__': ( 'core.html#pipelineresults.__len__',
49
+ 'ssi_analysis_result_parsers/core.py'),
50
+ 'ssi_analysis_result_parsers.core.PipelineResults.__repr__': ( 'core.html#pipelineresults.__repr__',
51
+ 'ssi_analysis_result_parsers/core.py'),
52
+ 'ssi_analysis_result_parsers.core.PipelineResults.from_results_dataframe': ( 'core.html#pipelineresults.from_results_dataframe',
53
+ 'ssi_analysis_result_parsers/core.py'),
54
+ 'ssi_analysis_result_parsers.core.PipelineResults.from_results_tsv': ( 'core.html#pipelineresults.from_results_tsv',
55
+ 'ssi_analysis_result_parsers/core.py'),
56
+ 'ssi_analysis_result_parsers.core.PipelineResults.items': ( 'core.html#pipelineresults.items',
57
+ 'ssi_analysis_result_parsers/core.py'),
58
+ 'ssi_analysis_result_parsers.core.PipelineResults.results': ( 'core.html#pipelineresults.results',
59
+ 'ssi_analysis_result_parsers/core.py'),
60
+ 'ssi_analysis_result_parsers.core.PipelineResults.write_tsv': ( 'core.html#pipelineresults.write_tsv',
61
+ 'ssi_analysis_result_parsers/core.py'),
62
+ 'ssi_analysis_result_parsers.core.get_config': ( 'core.html#get_config',
63
+ 'ssi_analysis_result_parsers/core.py'),
64
+ 'ssi_analysis_result_parsers.core.get_samplesheet': ( 'core.html#get_samplesheet',
65
+ 'ssi_analysis_result_parsers/core.py'),
66
+ 'ssi_analysis_result_parsers.core.print_results_dict_to_tsv': ( 'core.html#print_results_dict_to_tsv',
67
+ 'ssi_analysis_result_parsers/core.py'),
68
+ 'ssi_analysis_result_parsers.core.set_env_variables': ( 'core.html#set_env_variables',
69
+ 'ssi_analysis_result_parsers/core.py'),
70
+ 'ssi_analysis_result_parsers.core.show_project_env_vars': ( 'core.html#show_project_env_vars',
71
+ 'ssi_analysis_result_parsers/core.py'),
72
+ 'ssi_analysis_result_parsers.core.update_results_dict': ( 'core.html#update_results_dict',
73
+ 'ssi_analysis_result_parsers/core.py')},
74
+ 'ssi_analysis_result_parsers.hello_world': { 'ssi_analysis_result_parsers.hello_world.cli': ( 'hello_world.html#cli',
75
+ 'ssi_analysis_result_parsers/hello_world.py'),
76
+ 'ssi_analysis_result_parsers.hello_world.hello_world': ( 'hello_world.html#hello_world',
77
+ 'ssi_analysis_result_parsers/hello_world.py')},
78
+ 'ssi_analysis_result_parsers.some_string': {}}}
@@ -49,36 +49,38 @@ def extract_presence_absence(
49
49
 
50
50
  """
51
51
  if os.path.exists(blast_output_tsv):
52
- blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
53
- blast_df.columns = tsv_header.split(" ")
54
- blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
55
- blast_df_unique = (
56
- blast_df.sort_values(by=["bitscore"], ascending=False)
57
- .groupby("qseqid")
58
- .first()
59
- )
60
- blast_df_filtered = blast_df_unique.query(
61
- "plen > @plen_threshold and pident > @pident_threshold"
62
- )
52
+ try:
53
+ blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
54
+
55
+ blast_df.columns = tsv_header.split(" ")
56
+ blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
57
+ blast_df_unique = (
58
+ blast_df.sort_values(by=["bitscore"], ascending=False)
59
+ .groupby("qseqid")
60
+ .first()
61
+ )
62
+ blast_df_filtered = blast_df_unique.query(
63
+ "plen > @plen_threshold and pident > @pident_threshold"
64
+ )
65
+ blast_dict = dict(blast_df_filtered.to_dict(orient="index"))
66
+ except pandas.errors.EmptyDataError:
67
+ blast_dict = {}
68
+ print(f"Blast output file {blast_output_tsv} empty. Assuming 0 blast hits.")
63
69
  if hits_as_string:
64
- if include_match_stats:
65
- results = []
66
- for gene, d in blast_df_filtered.to_dict(orient="index").items():
67
- results.append(f"{gene}__{d['pident']}__{d['plen']}")
68
- result_dict = {"genes_found": ", ".join(results)}
69
- return result_dict
70
70
 
71
- else:
72
- result_dict = {
73
- "genes_found": ", ".join(list(blast_df_filtered.index.values))
74
- }
75
- return result_dict
71
+ results = []
72
+ for gene, d in blast_dict.items():
73
+ if include_match_stats:
74
+ results.append(f"{gene}__{d['pident']}__{d['plen']}")
75
+ else:
76
+ results.append(gene)
77
+ result_dict = {"genes_found": ", ".join(results)}
78
+ return result_dict
76
79
 
77
80
  else:
78
81
  result_dict = {}
79
- blast_dict = dict(blast_df_filtered.to_dict(orient="index").items())
80
82
  if gene_names is None:
81
- gene_names = blast_dict.keys()
83
+ gene_names = list(blast_dict.keys())
82
84
  for gene in gene_names:
83
85
  if gene in blast_dict:
84
86
  if include_match_stats:
@@ -2,7 +2,8 @@
2
2
 
3
3
  # %% auto 0
4
4
  __all__ = ['PACKAGE_NAME', 'DEV_MODE', 'PACKAGE_DIR', 'PROJECT_DIR', 'config', 'set_env_variables', 'get_config',
5
- 'show_project_env_vars', 'get_samplesheet', 'update_results_dict', 'print_results_dict_to_tsv']
5
+ 'show_project_env_vars', 'get_samplesheet', 'PipelineResults', 'update_results_dict',
6
+ 'print_results_dict_to_tsv']
6
7
 
7
8
  # %% ../nbs/00_core.ipynb 4
8
9
  # Need the ssi_analysis_result_parsers for a few functions, this can be considered a static var
@@ -192,48 +193,97 @@ def get_samplesheet(sample_sheet_config: dict) -> pd.DataFrame:
192
193
  return df
193
194
 
194
195
  # %% ../nbs/00_core.ipynb 24
196
+ class PipelineResults:
197
+
198
+ def __init__(self, results_dict):
199
+ print(results_dict)
200
+ self.results_dict = results_dict
201
+ self.results_df = pandas.DataFrame.from_dict(results_dict, orient="index")
202
+
203
+ def write_tsv(self, output_file: Path) -> None:
204
+ print_df = self.results_df.reset_index(names="sample_name")
205
+ print_df.to_csv(output_file, sep="\t", index=False)
206
+ return None
207
+
208
+ @classmethod
209
+ def from_results_dataframe(cls, results_df: pandas.DataFrame):
210
+ # results_df = results_df.set_index("sample_name")
211
+ results_dict = results_df.to_dict(orient="index")
212
+ return cls(results_dict)
213
+
214
+ @classmethod
215
+ def from_results_tsv(cls, results_tsv: Path):
216
+ results_df = pandas.read_csv(results_tsv, sep="\t")
217
+ results_df.set_index("sample_name", inplace=True, drop=True)
218
+ results_dict = results_df.to_dict(orient="index")
219
+ return cls(results_dict)
220
+
221
+ def __repr__(self):
222
+ return f"< Generic analysis results object. {len(self.results_df)} samples with {len(self.results_df.columns)} result variables > "
223
+
224
+ def __len__(self):
225
+ return len(self.results_dict)
226
+
227
+ def __iter__(self):
228
+ for sample_name in self.results_dict:
229
+ yield sample_name
230
+
231
+ def items(self):
232
+ for sample_name, results_d in self.results_dict:
233
+ yield sample_name, results_d
234
+
235
+ def results(self):
236
+ for results_d in self.results_dict.values():
237
+ yield results_d
238
+
239
+
195
240
  def update_results_dict(
196
241
  old_results: dict,
197
242
  new_results: dict,
198
243
  old_duplicate_key_prefix: str = None,
199
244
  new_duplicate_key_prefix: str = None,
200
245
  ):
201
- duplicate_keys = list(set(old_results.keys()) & set(new_results.keys()))
202
- if len(duplicate_keys) == 0:
203
- old_results.update(new_results)
246
+ if old_results is None:
247
+ return new_results
248
+ elif new_results is None:
204
249
  return old_results
205
250
  else:
206
- if old_duplicate_key_prefix is None and new_duplicate_key_prefix is None:
207
- raise ValueError(
208
- "Provided dictionaries contain duplicate keys. old_duplicate_key_prefix and/or new_duplicate_key_prefix must be provided"
209
- )
210
- elif old_duplicate_key_prefix == new_duplicate_key_prefix:
211
- raise ValueError(
212
- "old_duplicate_key_prefix and new_duplicate_key_prefix cannot be identical"
213
- )
251
+ duplicate_keys = list(set(old_results.keys()) & set(new_results.keys()))
252
+ if len(duplicate_keys) == 0:
253
+ old_results.update(new_results)
254
+ return old_results
214
255
  else:
215
- combined_dict = {}
216
- if old_duplicate_key_prefix is None:
217
- combined_dict.update(old_results)
218
- else:
219
- for key, value in old_results.items():
220
- if key in duplicate_keys:
221
- combined_dict.update(
222
- {f"{old_duplicate_key_prefix}{key}": value}
223
- )
224
- else:
225
- combined_dict.update({key: value})
226
- if new_duplicate_key_prefix is None:
227
- combined_dict.update(new_results)
256
+ if old_duplicate_key_prefix is None and new_duplicate_key_prefix is None:
257
+ raise ValueError(
258
+ "Provided dictionaries contain duplicate keys. Old_duplicate_key_prefix and/or new_duplicate_key_prefix must be provided"
259
+ )
260
+ elif old_duplicate_key_prefix == new_duplicate_key_prefix:
261
+ raise ValueError(
262
+ "old_duplicate_key_prefix and new_duplicate_key_prefix cannot be identical"
263
+ )
228
264
  else:
229
- for key, value in new_results.items():
230
- if key in duplicate_keys:
231
- combined_dict.update(
232
- {f"{new_duplicate_key_prefix}{key}": value}
233
- )
234
- else:
235
- combined_dict.update({key: value})
236
- return combined_dict
265
+ combined_dict = {}
266
+ if old_duplicate_key_prefix is None:
267
+ combined_dict.update(old_results)
268
+ else:
269
+ for key, value in old_results.items():
270
+ if key in duplicate_keys:
271
+ combined_dict.update(
272
+ {f"{old_duplicate_key_prefix}{key}": value}
273
+ )
274
+ else:
275
+ combined_dict.update({key: value})
276
+ if new_duplicate_key_prefix is None:
277
+ combined_dict.update(new_results)
278
+ else:
279
+ for key, value in new_results.items():
280
+ if key in duplicate_keys:
281
+ combined_dict.update(
282
+ {f"{new_duplicate_key_prefix}{key}": value}
283
+ )
284
+ else:
285
+ combined_dict.update({key: value})
286
+ return combined_dict
237
287
 
238
288
 
239
289
  def print_results_dict_to_tsv(
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ssi_analysis_result_parsers
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: TODO
5
5
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
6
- Author: thej-ssi
6
+ Author: Thor Bech Johannesen
7
7
  Author-email: thej@ssi.dk
8
8
  License: MIT License
9
9
  Keywords: nbdev jupyter notebook python
@@ -21,8 +21,14 @@ ssi_analysis_result_parsers.egg-info/top_level.txt
21
21
  ssi_analysis_result_parsers/config/config.default.env
22
22
  ssi_analysis_result_parsers/config/config.default.yaml
23
23
  test_input/.DS_Store
24
+ test_input/Legionella/batch_parser_file_paths.tsv
24
25
  test_input/Legionella/lag-1_blast.tsv
26
+ test_input/Legionella/lag-1_blast_2.tsv
25
27
  test_input/Legionella/test.sbt.tsv
28
+ test_input/Legionella/test2.sbt.tsv
26
29
  test_input/blast_parser/allele_matches_test.tsv
30
+ test_input/blast_parser/empty_gene_presence_absense_test.tsv
27
31
  test_input/blast_parser/gene_presence_absence_test.tsv
28
- test_output/output_with_sample_name.tsv
32
+ test_output/output_with_sample_name.tsv
33
+ test_output/test.tsv
34
+ test_output/test_batch_output.tsv
@@ -1,7 +1,8 @@
1
1
  [console_scripts]
2
2
  blast_parser_allele_matches = ssi_analysis_result_parsers.blast_parser:allele_matches
3
3
  blast_parser_presence_absence = ssi_analysis_result_parsers.blast_parser:presence_absence
4
- legionella_parser = ssi_analysis_result_parsers.Legionella_parser:legionella_parser
4
+ get_leg_results = ssi_analysis_result_parsers.Legionella_parser:legionella_parser
5
+ get_leg_results_batch = ssi_analysis_result_parsers.Legionella_parser:legionella_batch_parser
5
6
 
6
7
  [nbdev]
7
8
  ssi_analysis_result_parsers = ssi_analysis_result_parsers._modidx:d
@@ -0,0 +1,3 @@
1
+ sample_name sbt_results lag1_blast_results
2
+ sample_1 test_input/Legionella/test.sbt.tsv test_input/Legionella/lag-1_blast.tsv
3
+ sample_2 test_input/Legionella/test2.sbt.tsv test_input/Legionella/lag-1_blast_2.tsv
@@ -0,0 +1,2 @@
1
+ sample ST flaA pilE asd mip mompS proA neuA notes
2
+ LEG-2024-R11031.fasta 182 3 4 1 3 35 9 11 Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call
@@ -0,0 +1,3 @@
1
+ sample_name ST flaA pilE asd mip mompS proA neuA notes lag-1
2
+ sample_1 23 2 3 9 10 2 1 6 Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call 1
3
+ sample_2 182 3 4 1 3 35 9 11 Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call 0
@@ -0,0 +1,3 @@
1
+ sample_name ST flaA pilE asd mip mompS proA neuA notes lag-1
2
+ sample_1 23 2 3 9 10 2 1 6 Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call 1
3
+ sample_2 182 3 4 1 3 35 9 11 Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call 0
@@ -1,88 +0,0 @@
1
- # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.
2
-
3
- # %% auto 0
4
- __all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_parser']
5
-
6
- # %% ../nbs/39_Legionella_parser.ipynb 3
7
- # standard libs
8
- import os
9
- import re
10
-
11
- # Common to template
12
- # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
13
- import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
14
- import envyaml # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
15
- import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
16
- from fastcore import (
17
- test,
18
- )
19
- from fastcore.script import (
20
- call_parse,
21
- ) # for @call_parse, https://fastcore.fast.ai/script
22
- import json # for nicely printing json and yaml
23
-
24
- # import functions from core module (optional, but most likely needed).
25
- from ssi_analysis_result_parsers import (
26
- core,
27
- )
28
- from .blast_parser import extract_presence_absence
29
-
30
- # Project specific libraries
31
- from pathlib import Path
32
- import pandas
33
- import sys
34
-
35
- # %% ../nbs/39_Legionella_parser.ipynb 6
36
def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
    """
    Return the results found in the Legionella SBT summary output.

    The TSV is read with pandas, indexed by its "sample" column, and the
    remaining columns of the first row are returned as a dictionary.

    Args:
        legionella_sbt_results_tsv: Path to the tab-separated SBT summary.

    Returns:
        A dict mapping column name to value for the first sample row, or
        None (after a message on stderr) when the path is None, the file
        does not exist, or the file holds no sample rows.
    """
    if legionella_sbt_results_tsv is None or not os.path.exists(
        legionella_sbt_results_tsv
    ):
        print(
            f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
            file=sys.stderr,
        )
        return None

    try:
        df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
    except pandas.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like a file
        # without results instead of crashing.
        df = None

    if df is None or df.empty:
        # Header-only (or empty) output: there is no first row to report.
        print(
            f"Legionella SBT output at {legionella_sbt_results_tsv} contains no results",
            file=sys.stderr,
        )
        return None

    rows_by_sample = df.set_index("sample", drop=True).to_dict(orient="index")
    # Only the first sample row is reported.
    return next(iter(rows_by_sample.values()))
52
-
53
-
54
def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
    """
    Combine the Legionella SBT results with the lag-1 BLAST presence/absence call.

    Args:
        legionella_sbt_results_tsv: Path to the SBT summary TSV, parsed by
            extract_legionella_sbt.
        lag1_blast_tsv: Path to the lag-1 BLAST output, parsed by
            extract_presence_absence for the single gene "lag-1".

    Returns:
        Whatever core.update_results_dict returns for the two parsed results.
        NOTE(review): the "SBT: " prefix is presumably applied to SBT keys
        that collide with BLAST keys -- confirm against core.
    """
    # Parse the SBT summary first, then the BLAST output (same order as before).
    sbt_part = extract_legionella_sbt(
        legionella_sbt_results_tsv=legionella_sbt_results_tsv
    )
    lag1_part = extract_presence_absence(
        blast_output_tsv=lag1_blast_tsv,
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
    return core.update_results_dict(
        sbt_part, lag1_part, old_duplicate_key_prefix="SBT: "
    )
68
-
69
- # %% ../nbs/39_Legionella_parser.ipynb 9
70
@call_parse
def legionella_parser(
    legionella_sbt_file: Path = None,  # Path to "*.sbt.tsv" from legionella_sbt program
    lag_1_blast_output: Path = None,  # Path to output from lag1_blast. Generated with blastn -query lag-1.fasta -subject assembly.fasta -outfmt "6 qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore"
    output_file: Path = None,  # Path to output tsv
    sample_name: str = None,  # Sample name passed on to the tsv writer
    config_file: str = None,  # config file to set env vars from (currently unused)
) -> None:
    """Summarise Legionella SBT and lag-1 BLAST results for one sample and write them to a tsv."""
    # NOTE: config_file is accepted but not read; config loading is disabled.
    # config = core.get_config(config_file)  # Set env vars and get config variables
    legionella_summary_dict = legionella_summary(
        legionella_sbt_results_tsv=legionella_sbt_file,
        lag1_blast_tsv=lag_1_blast_output,
    )
    core.print_results_dict_to_tsv(
        results_dict=legionella_summary_dict,
        output_file=output_file,
        sample_name=sample_name,
    )
@@ -1 +0,0 @@
1
- __version__ = "0.0.1"
@@ -1,38 +0,0 @@
1
- # Autogenerated by nbdev
2
-
3
- d = { 'settings': { 'branch': 'main',
4
- 'doc_baseurl': '/ssi_analysis_result_parsers',
5
- 'doc_host': 'https://$GIT_USER_NAME.github.io',
6
- 'git_url': 'https://github.com/$GIT_USER_NAME/ssi_analysis_result_parsers',
7
- 'lib_path': 'ssi_analysis_result_parsers'},
8
- 'syms': { 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': ( 'legionella_parser.html#extract_legionella_sbt',
9
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
10
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': ( 'legionella_parser.html#legionella_parser',
11
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
12
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': ( 'legionella_parser.html#legionella_summary',
13
- 'ssi_analysis_result_parsers/Legionella_parser.py')},
14
- 'ssi_analysis_result_parsers.blast_parser': { 'ssi_analysis_result_parsers.blast_parser.allele_matches': ( 'blast_parser.html#allele_matches',
15
- 'ssi_analysis_result_parsers/blast_parser.py'),
16
- 'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': ( 'blast_parser.html#extract_allele_matches',
17
- 'ssi_analysis_result_parsers/blast_parser.py'),
18
- 'ssi_analysis_result_parsers.blast_parser.extract_presence_absence': ( 'blast_parser.html#extract_presence_absence',
19
- 'ssi_analysis_result_parsers/blast_parser.py'),
20
- 'ssi_analysis_result_parsers.blast_parser.presence_absence': ( 'blast_parser.html#presence_absence',
21
- 'ssi_analysis_result_parsers/blast_parser.py')},
22
- 'ssi_analysis_result_parsers.core': { 'ssi_analysis_result_parsers.core.get_config': ( 'core.html#get_config',
23
- 'ssi_analysis_result_parsers/core.py'),
24
- 'ssi_analysis_result_parsers.core.get_samplesheet': ( 'core.html#get_samplesheet',
25
- 'ssi_analysis_result_parsers/core.py'),
26
- 'ssi_analysis_result_parsers.core.print_results_dict_to_tsv': ( 'core.html#print_results_dict_to_tsv',
27
- 'ssi_analysis_result_parsers/core.py'),
28
- 'ssi_analysis_result_parsers.core.set_env_variables': ( 'core.html#set_env_variables',
29
- 'ssi_analysis_result_parsers/core.py'),
30
- 'ssi_analysis_result_parsers.core.show_project_env_vars': ( 'core.html#show_project_env_vars',
31
- 'ssi_analysis_result_parsers/core.py'),
32
- 'ssi_analysis_result_parsers.core.update_results_dict': ( 'core.html#update_results_dict',
33
- 'ssi_analysis_result_parsers/core.py')},
34
- 'ssi_analysis_result_parsers.hello_world': { 'ssi_analysis_result_parsers.hello_world.cli': ( 'hello_world.html#cli',
35
- 'ssi_analysis_result_parsers/hello_world.py'),
36
- 'ssi_analysis_result_parsers.hello_world.hello_world': ( 'hello_world.html#hello_world',
37
- 'ssi_analysis_result_parsers/hello_world.py')},
38
- 'ssi_analysis_result_parsers.some_string': {}}}