ssi-analysis-result-parsers 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssi_analysis_result_parsers/Legionella_parser.py +88 -0
- ssi_analysis_result_parsers/__init__.py +1 -0
- ssi_analysis_result_parsers/_modidx.py +38 -0
- ssi_analysis_result_parsers/blast_parser.py +178 -0
- ssi_analysis_result_parsers/config/config.default.env +24 -0
- ssi_analysis_result_parsers/config/config.default.yaml +9 -0
- ssi_analysis_result_parsers/core.py +252 -0
- ssi_analysis_result_parsers/hello_world.py +61 -0
- ssi_analysis_result_parsers/some_string.py +27 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/METADATA +109 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/RECORD +21 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/WHEEL +5 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/entry_points.txt +7 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/licenses/LICENSE +8 -0
- ssi_analysis_result_parsers-0.0.1.dist-info/top_level.txt +8 -0
- test_input/.DS_Store +0 -0
- test_input/Legionella/lag-1_blast.tsv +1 -0
- test_input/Legionella/test.sbt.tsv +2 -0
- test_input/blast_parser/allele_matches_test.tsv +536 -0
- test_input/blast_parser/gene_presence_absence_test.tsv +3 -0
- test_output/output_with_sample_name.tsv +2 -0
ssi_analysis_result_parsers/Legionella_parser.py
@@ -0,0 +1,88 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.
+
+# %% auto 0
+__all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_parser']
+
+# %% ../nbs/39_Legionella_parser.ipynb 3
+# standard libs
+import os
+import re
+
+# Common to template
+# add to settings.ini requirements; package name is python-dotenv; for conda build ensure `conda config --add channels conda-forge`
+import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
+import envyaml  # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
+import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
+from fastcore import (
+    test,
+)
+from fastcore.script import (
+    call_parse,
+)  # for @call_parse, https://fastcore.fast.ai/script
+import json  # for nicely printing json and yaml
+
+# import functions from core module (optional, but most likely needed).
+from ssi_analysis_result_parsers import (
+    core,
+)
+from .blast_parser import extract_presence_absence
+
+# Project specific libraries
+from pathlib import Path
+import pandas
+import sys
+
+# %% ../nbs/39_Legionella_parser.ipynb 6
+def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
+    """
+    Returns a dictionary of the results found in the Legionella SBT summary output
+    """
+    if os.path.exists(legionella_sbt_results_tsv):
+        df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
+        df.set_index("sample", inplace=True, drop=True)
+        d = df.to_dict(orient="index")
+        fname = next(iter(d))
+        return d[fname]
+    else:
+        print(
+            f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
+            file=sys.stderr,
+        )
+        return None
+
+
+def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
+    sbt_results_dict = extract_legionella_sbt(
+        legionella_sbt_results_tsv=legionella_sbt_results_tsv
+    )
+    lag1_blast_dict = extract_presence_absence(
+        blast_output_tsv=lag1_blast_tsv,
+        hits_as_string=False,
+        include_match_stats=False,
+        gene_names=["lag-1"],
+    )
+    results_dict = core.update_results_dict(
+        sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
+    )
+    return results_dict
+
+# %% ../nbs/39_Legionella_parser.ipynb 9
+@call_parse
+def legionella_parser(
+    legionella_sbt_file: Path = None,  # Path to *.sbt.tsv from the legionella_sbt program
+    lag_1_blast_output: Path = None,  # Path to output from lag1_blast. Generated with blastn -query lag-1.fasta -subject assembly.fasta -outfmt "6 qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore"
+    output_file: Path = None,  # Path to output tsv
+    sample_name: str = None,
+    config_file: str = None,  # config file to set env vars from
+) -> None:
+    """Combine Legionella SBT typing results and lag-1 presence/absence into a single summary tsv."""
+    # config = core.get_config(config_file)  # Set env vars and get config variables
+    legionella_summary_dict = legionella_summary(
+        legionella_sbt_results_tsv=legionella_sbt_file,
+        lag1_blast_tsv=lag_1_blast_output,
+    )
+    core.print_results_dict_to_tsv(
+        results_dict=legionella_summary_dict,
+        output_file=output_file,
+        sample_name=sample_name,
+    )
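For orientation, here is a minimal sketch of exercising `legionella_summary` on the fixtures shipped in `test_input/Legionella/` (listed in the manifest above); the fixture contents and the resulting keys are assumptions, so treat the printed shape as illustrative.

    # Hedged usage sketch, assuming the bundled fixtures follow the expected layout.
    from pathlib import Path

    from ssi_analysis_result_parsers.Legionella_parser import legionella_summary

    summary = legionella_summary(
        legionella_sbt_results_tsv=Path("test_input/Legionella/test.sbt.tsv"),
        lag1_blast_tsv=Path("test_input/Legionella/lag-1_blast.tsv"),
    )
    # Expected shape: the SBT columns for the sample merged with {"lag-1": "1" or "0"};
    # keys that clash on merge get the "SBT: " prefix on the SBT side.
    print(summary)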
ssi_analysis_result_parsers/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
ssi_analysis_result_parsers/_modidx.py
@@ -0,0 +1,38 @@
+# Autogenerated by nbdev
+
+d = { 'settings': { 'branch': 'main',
+                    'doc_baseurl': '/ssi_analysis_result_parsers',
+                    'doc_host': 'https://$GIT_USER_NAME.github.io',
+                    'git_url': 'https://github.com/$GIT_USER_NAME/ssi_analysis_result_parsers',
+                    'lib_path': 'ssi_analysis_result_parsers'},
+      'syms': { 'ssi_analysis_result_parsers.Legionella_parser': {
+                    'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': (
+                        'legionella_parser.html#extract_legionella_sbt', 'ssi_analysis_result_parsers/Legionella_parser.py'),
+                    'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': (
+                        'legionella_parser.html#legionella_parser', 'ssi_analysis_result_parsers/Legionella_parser.py'),
+                    'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': (
+                        'legionella_parser.html#legionella_summary', 'ssi_analysis_result_parsers/Legionella_parser.py')},
+                'ssi_analysis_result_parsers.blast_parser': {
+                    'ssi_analysis_result_parsers.blast_parser.allele_matches': (
+                        'blast_parser.html#allele_matches', 'ssi_analysis_result_parsers/blast_parser.py'),
+                    'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': (
+                        'blast_parser.html#extract_allele_matches', 'ssi_analysis_result_parsers/blast_parser.py'),
+                    'ssi_analysis_result_parsers.blast_parser.extract_presence_absence': (
+                        'blast_parser.html#extract_presence_absence', 'ssi_analysis_result_parsers/blast_parser.py'),
+                    'ssi_analysis_result_parsers.blast_parser.presence_absence': (
+                        'blast_parser.html#presence_absence', 'ssi_analysis_result_parsers/blast_parser.py')},
+                'ssi_analysis_result_parsers.core': {
+                    'ssi_analysis_result_parsers.core.get_config': ( 'core.html#get_config', 'ssi_analysis_result_parsers/core.py'),
+                    'ssi_analysis_result_parsers.core.get_samplesheet': ( 'core.html#get_samplesheet', 'ssi_analysis_result_parsers/core.py'),
+                    'ssi_analysis_result_parsers.core.print_results_dict_to_tsv': ( 'core.html#print_results_dict_to_tsv', 'ssi_analysis_result_parsers/core.py'),
+                    'ssi_analysis_result_parsers.core.set_env_variables': ( 'core.html#set_env_variables', 'ssi_analysis_result_parsers/core.py'),
+                    'ssi_analysis_result_parsers.core.show_project_env_vars': ( 'core.html#show_project_env_vars', 'ssi_analysis_result_parsers/core.py'),
+                    'ssi_analysis_result_parsers.core.update_results_dict': ( 'core.html#update_results_dict', 'ssi_analysis_result_parsers/core.py')},
+                'ssi_analysis_result_parsers.hello_world': {
+                    'ssi_analysis_result_parsers.hello_world.cli': ( 'hello_world.html#cli', 'ssi_analysis_result_parsers/hello_world.py'),
+                    'ssi_analysis_result_parsers.hello_world.hello_world': ( 'hello_world.html#hello_world', 'ssi_analysis_result_parsers/hello_world.py')},
+                'ssi_analysis_result_parsers.some_string': {}}}
ssi_analysis_result_parsers/blast_parser.py
@@ -0,0 +1,178 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/11_blast_parser.ipynb.
+
+# %% auto 0
+__all__ = ['extract_presence_absence', 'extract_allele_matches', 'presence_absence', 'allele_matches']
+
+# %% ../nbs/11_blast_parser.ipynb 3
+# standard libs
+import os
+import re
+
+# Common to template
+# add to settings.ini requirements; package name is python-dotenv; for conda build ensure `conda config --add channels conda-forge`
+import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
+import envyaml  # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
+import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
+from fastcore import (
+    test,
+)
+from fastcore.script import (
+    call_parse,
+)  # for @call_parse, https://fastcore.fast.ai/script
+import json  # for nicely printing json and yaml
+
+# import functions from core module (optional, but most likely needed).
+from . import core
+
+# Project specific libraries
+from pathlib import Path
+import pandas
+import sys
+
+# %% ../nbs/11_blast_parser.ipynb 6
+def extract_presence_absence(
+    blast_output_tsv: Path,
+    tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",
+    hits_as_string: bool = True,
+    include_match_stats: bool = False,
+    pident_threshold: float = 90,
+    plen_threshold: float = 60,
+    gene_names: list = None,
+) -> dict:
+    """
+    Parse a blast output tsv for gene presence/absence, keeping the best hit per query.
+    returns:
+        if hits_as_string:
+            { "genes_found": "<gene_1>, <gene_2>, ..." } (each entry suffixed with __<pident>__<plen> when include_match_stats)
+        else:
+            { <gene_name>: "1" or "0" }, or { <gene_name>: "<pident>__<plen>" or "0" } when include_match_stats
+    """
+    if os.path.exists(blast_output_tsv):
+        blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
+        blast_df.columns = tsv_header.split(" ")
+        blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
+        blast_df_unique = (
+            blast_df.sort_values(by=["bitscore"], ascending=False)
+            .groupby("qseqid")
+            .first()
+        )
+        blast_df_filtered = blast_df_unique.query(
+            "plen > @plen_threshold and pident > @pident_threshold"
+        )
+        if hits_as_string:
+            if include_match_stats:
+                results = []
+                for gene, d in blast_df_filtered.to_dict(orient="index").items():
+                    results.append(f"{gene}__{d['pident']}__{d['plen']}")
+                result_dict = {"genes_found": ", ".join(results)}
+                return result_dict
+            else:
+                result_dict = {
+                    "genes_found": ", ".join(list(blast_df_filtered.index.values))
+                }
+                return result_dict
+        else:
+            result_dict = {}
+            blast_dict = dict(blast_df_filtered.to_dict(orient="index").items())
+            if gene_names is None:
+                gene_names = blast_dict.keys()
+            for gene in gene_names:
+                if gene in blast_dict:
+                    if include_match_stats:
+                        result_dict[gene] = (
+                            f"{blast_dict[gene]['pident']}__{blast_dict[gene]['plen']}"
+                        )
+                    else:
+                        result_dict[gene] = "1"
+                else:
+                    result_dict[gene] = "0"
+            return result_dict
+    else:
+        print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)
+
+
+def extract_allele_matches(
+    blast_output_tsv: Path, tsv_header: str, include_match_stats: bool = False
+) -> dict:
+    """
+    Parse a blast output tsv for the best matching allele per gene (query ids of the form <gene>_<allele>).
+    returns:
+        if include_match_stats:
+            { <gene_name_1>: <allele_number>__<pident>__<plen>, <gene_name_2>: <allele_number>__<pident>__<plen>, ... }
+        else:
+            a dictionary (allele_dict) in the format { <gene_name_1>: <allele_number>, <gene_name_2>: <allele_number>, ... }
+    """
+    allele_dict = {}
+    detailed_dict = {}
+    if os.path.exists(blast_output_tsv):
+        blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
+        blast_df.columns = tsv_header.split(" ")
+        blast_df = blast_df.set_index("qseqid", drop=False)  # assign: set_index is not in-place
+        blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
+        blast_df[["gene", "allele"]] = blast_df["qseqid"].str.split("_", expand=True)
+        blast_df_unique = (
+            blast_df.sort_values(by=["bitscore"], ascending=False)
+            .groupby("gene")
+            .first()
+        )
+        for gene, d in blast_df_unique.to_dict(orient="index").items():
+            allele_dict[gene] = d["allele"]
+            detailed_dict[gene] = f"{d['allele']}__{d['pident']}__{d['plen']}"
+    else:
+        print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)
+
+    if include_match_stats:
+        return detailed_dict
+    else:
+        return allele_dict
+
+# %% ../nbs/11_blast_parser.ipynb 9
+from fastcore.script import call_parse
+
+
+@call_parse
+def presence_absence(
+    blast_output: Path = None,  # Path to blast output file. Generated with --outfmt 6 option
+    blast_tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",  # headers in blast output
+    hits_as_string: bool = True,  # True to print a comma-separated list of found genes on a single line. False to return a key: value pair for each gene
+    include_match_stats: bool = False,  # True to include percent identity and percent length in output, False to only include present/absent
+    percent_identity: float = 90,  # percent identity threshold for considering a gene present
+    percent_length: float = 60,  # percent length threshold for considering a gene present
+    gene_names: list = None,  # names of genes to look for when hits_as_string = False
+    output_file: Path = None,  # Path to output tsv
+    config_file: str = None,  # config file to set env vars from
+) -> None:
+    """Report gene presence/absence from a blast output tsv."""
+    # config = core.get_config(config_file)  # Set env vars and get config variables
+    gene_presence_dict = extract_presence_absence(
+        blast_output_tsv=blast_output,
+        tsv_header=blast_tsv_header,
+        hits_as_string=hits_as_string,
+        include_match_stats=include_match_stats,
+        pident_threshold=percent_identity,
+        plen_threshold=percent_length,
+        gene_names=gene_names,
+    )
+    core.print_results_dict_to_tsv(  # write the results so output_file is honoured
+        results_dict=gene_presence_dict,
+        output_file=output_file,
+    )
+
+
+@call_parse
+def allele_matches(
+    blast_output: Path = None,  # Path to blast output file. Generated with --outfmt 6 option
+    blast_tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",  # headers in blast output
+    include_match_stats: bool = False,  # True to include percent identity and percent length in output, False to only include allele number
+    output_file: Path = None,  # Path to output tsv
+    config_file: str = None,  # config file to set env vars from
+) -> None:
+    """Report the best matching allele per gene from a blast output tsv."""
+    # config = core.get_config(config_file)  # Set env vars and get config variables
+    allele_dict = extract_allele_matches(  # note: extract_allele_matches takes no output_file argument
+        blast_output_tsv=blast_output,
+        tsv_header=blast_tsv_header,
+        include_match_stats=include_match_stats,
+    )
+    core.print_results_dict_to_tsv(  # write the results so output_file is honoured
+        results_dict=allele_dict,
+        output_file=output_file,
+    )
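To make the filtering thresholds concrete, here is a self-contained sketch of `extract_presence_absence` on a single synthetic outfmt-6 row; the gene names and hit values are invented for illustration.

    # Hedged sketch: one synthetic blast hit, columns in the default tsv_header order.
    import os
    import tempfile

    from ssi_analysis_result_parsers.blast_parser import extract_presence_absence

    row = "lag-1\tcontig_3\t99.5\t900\t950\t1\t900\t100\t1000\tACGT\t0.0\t1500\n"
    with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as fh:
        fh.write(row)

    # plen = 900/950*100 ~ 94.7 > 60 and pident 99.5 > 90, so lag-1 counts as present.
    print(extract_presence_absence(fh.name, hits_as_string=False, gene_names=["lag-1", "mip"]))
    # -> {'lag-1': '1', 'mip': '0'}
    os.remove(fh.name)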
ssi_analysis_result_parsers/config/config.default.env
@@ -0,0 +1,24 @@
+# environmental (ENV) variables. These are written as SSI_ANALYSIS_RESULT_PARSERS_VARIABLENAME=VALUE to avoid conflicts with other ENV variables.
+# Using the standard template these values can be overwritten by:
+# - defining SSI_ANALYSIS_RESULT_PARSERS_CONFIG_FILE pointing to a similar file with a subset of values
+# - setting the values as environmental variables.
+# The priority goes: env variables > config file > default file.
+# All configs other than config.default.env are in .gitignore.
+# Every .env config file should have an associated .yaml config file that the program interacts with.
+
+# NOTE: remember that if referencing another ENV var as a variable, it needs to be defined first
+
+# If more structured variables are needed use config.default.yaml or another of your own creation
+# That file's path is stored as CORE_YAML_CONFIG_FILE when overriding
+# It is commented out because of the default use case, but should be included for all non-default cases.
+# CORE_YAML_CONFIG_FILE=
+CORE_PROJECT_VARIABLE_PREFIX=SSI_ANALYSIS_RESULT_PARSERS_
+# For testing purposes
+CORE_TEST_VAR="Test"
+
+# Example variables, please exchange with relevant variables
+SSI_ANALYSIS_RESULT_PARSERS_INPUT_DIR=./input
+SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_DIR=./output
+SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_FILE=${SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_DIR}/output.txt
+SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME=Kim
+
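A small sketch of the override order described above, assuming the package is installed; the name "Alex" is invented. Exporting a project variable before loading the config should shadow the shipped default "Kim", because the defaults are loaded with override disabled.

    # Hedged sketch of the documented override order.
    import os

    from ssi_analysis_result_parsers import core

    os.environ["SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME"] = "Alex"
    config = core.get_config()  # no extra config file, so the defaults fill the rest
    print(config["example"]["input"]["name"])  # -> "Alex" instead of the default "Kim"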
ssi_analysis_result_parsers/config/config.default.yaml
@@ -0,0 +1,9 @@
+# When accessing this in the code you'll work with it as a dict.
+# ENV variables will be replaced with their values. This is done with the envyaml package used by the template's `get_config`.
+# By convention all variables for the project should have the SSI_ANALYSIS_RESULT_PARSERS_* prefix.
+# e.g.
+# name: ${SSI_ANALYSIS_RESULT_PARSERS_NAME}
+example:
+  input:
+    name: ${SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME}
+    alternative_name: Lee
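For reference, after substitution this file renders as a plain nested dict; a rough sketch of the shape with the shipped defaults (the exact set of extra keys in the exported dict is an assumption):

    # Approximate result of core.get_config() with the shipped defaults.
    {
        "example": {
            "input": {
                "name": "Kim",  # filled from SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME
                "alternative_name": "Lee",
            }
        },
        # ...plus CORE_* keys and other substituted env values
    }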
ssi_analysis_result_parsers/core.py
@@ -0,0 +1,252 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
+
+# %% auto 0
+__all__ = ['PACKAGE_NAME', 'DEV_MODE', 'PACKAGE_DIR', 'PROJECT_DIR', 'config', 'set_env_variables', 'get_config',
+           'show_project_env_vars', 'get_samplesheet', 'update_results_dict', 'print_results_dict_to_tsv']
+
+# %% ../nbs/00_core.ipynb 4
+# Need the ssi_analysis_result_parsers package dir for a few functions; this can be considered a static var
+
+import importlib
+import importlib.util
+import os
+
+PACKAGE_NAME: str = (
+    "ssi_analysis_result_parsers"  # Make sure to adjust this to your package name
+)
+DEV_MODE: bool = (
+    False  # set below to override; as this is in an export block it'll be exported, while the dev mode section is not
+)
+
+PACKAGE_DIR = None
+try:
+    spec = importlib.util.find_spec(PACKAGE_NAME)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    PACKAGE_DIR = os.path.dirname(module.__file__)
+except (ImportError, AttributeError):
+    DEV_MODE = True
+PROJECT_DIR = os.getcwd()  # override value in dev mode
+if PROJECT_DIR.endswith("nbs"):
+    DEV_MODE = True
+    PROJECT_DIR = os.path.split(PROJECT_DIR)[0]
+
+# %% ../nbs/00_core.ipynb 10
+# standard libs
+import os
+import re
+
+# Common to template
+# add to settings.ini requirements; package name is python-dotenv; for conda build ensure `conda config --add channels conda-forge`
+import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
+import envyaml  # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
+import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
+import pandas  # For sample sheet manipulation
+from fastcore import (
+    test,
+)
+from fastcore.script import (
+    call_parse,
+)  # for @call_parse, https://fastcore.fast.ai/script
+
+# Project specific libraries
+
+from pathlib import Path
+from hashlib import sha256
+
+# %% ../nbs/00_core.ipynb 13
+import importlib
+import importlib.util
+
+
+def set_env_variables(config_path: str, overide_env_vars: bool = True) -> bool:
+    # load_dotenv sets environmental values from a file; if a value already exists it will not be overwritten unless override is set to True.
+    # If we have multiple .env files then the one which should take precedence must be applied last, with override.
+
+    # Order of precedence: .env file > environment variables > default values
+    # When developing, a change to the config will not be reflected until the environment is restarted
+
+    # Set the env vars first; this is needed for the yaml config to replace ENV variables
+    # NOTE: You need to adjust PACKAGE_NAME to your package name for this to work; the exception is only for dev purposes
+    # This checks if your package is installed, e.g. through pypi or `pip install -e .[dev]` for development. If it is, the config files shipped with the package provide the default values.
+    try:
+        dotenv.load_dotenv(f"{PACKAGE_DIR}/config/config.default.env", override=False)
+    except Exception:
+        print(f"Error: {PACKAGE_DIR}/config/config.default.env does not exist")
+        return False
+
+    # 2. set values from file:
+    if os.path.isfile(config_path):
+        dotenv.load_dotenv(config_path, override=overide_env_vars)
+
+    return True
+
+# %% ../nbs/00_core.ipynb 15
+import importlib
+import importlib.util
+
+
+def get_config(config_path: str = None, overide_env_vars: bool = True) -> dict:
+    if config_path is None:
+        config_path = ""
+    # First sets the environment with variables from config_path, then uses those variables to fill in the appropriate values in the config.yaml file; the yaml file is then returned as a dict
+    # If you want user env variables to take precedence over the config.yaml file then set overide_env_vars to False
+    set_env_variables(config_path, overide_env_vars)
+
+    config: dict = envyaml.EnvYAML(
+        os.environ.get(
+            "CORE_YAML_CONFIG_FILE", f"{PACKAGE_DIR}/config/config.default.yaml"
+        ),
+        strict=False,
+    ).export()
+
+    return config
+
+# %% ../nbs/00_core.ipynb 17
+# load the config on import, honouring CORE_CONFIG_FILE if set
+config = get_config(os.environ.get("CORE_CONFIG_FILE", ""))
+
+# %% ../nbs/00_core.ipynb 19
+def show_project_env_vars(config: dict) -> None:
+    # Prints out all the project environment variables
+    # This is useful for debugging and seeing what is being set
+    for k, v in config.items():
+        # If the ENV var starts with the project prefix then print it
+        if k.startswith(config["CORE_PROJECT_VARIABLE_PREFIX"]):
+            print(f"{k}={v}")
+
+# %% ../nbs/00_core.ipynb 23
+import pandas as pd
+
+
+def get_samplesheet(sample_sheet_config: dict) -> pd.DataFrame:
+    # Load the sample sheet into a pandas dataframe
+    # If columns is not None then only those columns are kept
+    # A .csv sample sheet is loaded as comma separated; anything else is assumed to be tab separated
+
+    # Expected sample_sheet_config:
+    # sample_sheet:
+    #   path: path/to/sample_sheet.tsv
+    #   delimiter: '\t'  # Optional, will assume , for csv and \t otherwise
+    #   header: 0  # Optional, 0 indicates first row is header, None indicates no header
+    #   columns: ['column1', 'column2', 'column3']  # Optional, if not provided all columns will be used
+
+    # Example sample sheet:
+    # #sample_id  file_path               metadata1  metadata2
+    # Sample1     /path/to/sample1.fasta  value1     option1
+    # Sample2     /path/to/sample2.fasta  value2     option2
+    # Sample3     /path/to/sample3.fasta  value3     option1
+    # Sample4     /path/to/sample4.fasta  value1     option2
+    # Sample5     /path/to/sample5.fasta  value2     option1
+
+    # This function also ensures the sample sheet is in the correct format, e.g. that the columns are correct and the sample names are unique.
+    if not os.path.isfile(sample_sheet_config["path"]):
+        raise FileNotFoundError(f"File {sample_sheet_config['path']} does not exist")
+    if "delimiter" in sample_sheet_config:
+        delimiter = sample_sheet_config["delimiter"]
+    else:
+        # do a best guess based on the file extension
+        delimiter = "," if sample_sheet_config["path"].endswith(".csv") else "\t"
+    header = 0
+    # if "header" in sample_sheet_config:
+    #     header = sample_sheet_config["header"]
+    # else:
+    #     # check if the first line starts with a #; if so assume it's a header, otherwise assume there isn't one
+    #     with open(sample_sheet_config["path"], "r") as f:
+    #         first_line = f.readline()
+    #     header = 0 if first_line.startswith("#") else None
+    if "columns" in sample_sheet_config:
+        columns = sample_sheet_config[
+            "columns"
+        ]  # note the # on the first item needs to be stripped to compare to the columns
+    else:
+        columns = None  # implies all columns
+    try:
+        # note when we have a header the first column may begin with a #, so we need to remove it
+        df = pd.read_csv(
+            sample_sheet_config["path"],
+            delimiter=delimiter,
+            header=header,
+            comment=None,
+        )
+    except Exception as e:
+        print(
+            "Error: Could not load the sample sheet into a dataframe; there is a problem with your sample sheet or the configuration."
+        )
+        raise e
+
+    # If the first header starts with a #, remove it for only that item
+    if df.columns[0].startswith("#"):
+        df.columns = [col.lstrip("#") for col in df.columns]
+    # Ensure the sample sheet has the correct columns
+    if columns is not None and not all([col in df.columns for col in columns]):
+        raise ValueError("Error: Sample sheet does not have the correct columns")
+    # also drop columns which are not needed
+    if columns is not None:
+        df = df[columns]
+
+    # Clean the df of any extra rows caused by empty lines in the sample sheet
+    df = df.dropna(how="all")
+    return df
+
+# %% ../nbs/00_core.ipynb 24
+def update_results_dict(
+    old_results: dict,
+    new_results: dict,
+    old_duplicate_key_prefix: str = None,
+    new_duplicate_key_prefix: str = None,
+):
+    duplicate_keys = list(set(old_results.keys()) & set(new_results.keys()))
+    if len(duplicate_keys) == 0:
+        old_results.update(new_results)
+        return old_results
+    else:
+        if old_duplicate_key_prefix is None and new_duplicate_key_prefix is None:
+            raise ValueError(
+                "Provided dictionaries contain duplicate keys. old_duplicate_key_prefix and/or new_duplicate_key_prefix must be provided"
+            )
+        elif old_duplicate_key_prefix == new_duplicate_key_prefix:
+            raise ValueError(
+                "old_duplicate_key_prefix and new_duplicate_key_prefix cannot be identical"
+            )
+        else:
+            combined_dict = {}
+            if old_duplicate_key_prefix is None:
+                combined_dict.update(old_results)
+            else:
+                for key, value in old_results.items():
+                    if key in duplicate_keys:
+                        combined_dict[f"{old_duplicate_key_prefix}{key}"] = value
+                    else:
+                        combined_dict[key] = value
+            if new_duplicate_key_prefix is None:
+                combined_dict.update(new_results)
+            else:
+                for key, value in new_results.items():
+                    if key in duplicate_keys:
+                        combined_dict[f"{new_duplicate_key_prefix}{key}"] = value
+                    else:
+                        combined_dict[key] = value
+            return combined_dict
+
+
+def print_results_dict_to_tsv(
+    results_dict: dict, output_file: Path, sample_name: str = None
+) -> None:
+    if sample_name is None:
+        header = "\t".join(str(x) for x in results_dict.keys())
+        values = "\t".join(str(x) for x in results_dict.values())
+    else:
+        header = "sample_name\t" + "\t".join(str(x) for x in results_dict.keys())
+        values = sample_name + "\t" + "\t".join(str(x) for x in results_dict.values())
+    with open(output_file, "w") as o:
+        o.write(header + "\n")
+        o.write(values + "\n")
+    return None
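A quick sketch of the duplicate-key behaviour of `update_results_dict`, matching how `legionella_summary` uses it; the dictionaries here are invented:

    # Hedged sketch: duplicate keys get the supplied prefix on the old side only.
    from ssi_analysis_result_parsers.core import update_results_dict

    old = {"ST": "42", "sample": "A1"}
    new = {"ST": "new call", "lag-1": "1"}
    merged = update_results_dict(old, new, old_duplicate_key_prefix="SBT: ")
    print(merged)
    # -> {'SBT: ST': '42', 'sample': 'A1', 'ST': 'new call', 'lag-1': '1'}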
ssi_analysis_result_parsers/hello_world.py
@@ -0,0 +1,61 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_hello_world.ipynb.
+
+# %% auto 0
+__all__ = ['hello_world', 'cli']
+
+# %% ../nbs/01_hello_world.ipynb 6
+# The export directive above makes sure this code goes into the module.
+
+# standard libs
+import os
+import re
+
+# Common to template
+# add to settings.ini requirements; package name is python-dotenv; for conda build ensure `conda config --add channels conda-forge`
+import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
+import envyaml  # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
+import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
+from fastcore import (
+    test,
+)
+from fastcore.script import (
+    call_parse,
+)  # for @call_parse, https://fastcore.fast.ai/script
+import json  # for nicely printing json and yaml
+
+# Project specific libraries
+
+# %% ../nbs/01_hello_world.ipynb 8
+from ssi_analysis_result_parsers import (
+    core,
+)
+
+# %% ../nbs/01_hello_world.ipynb 17
+def hello_world(name1: str, name2: str) -> str:
+    return f"Hello {name1} and {name2}!"
+
+# %% ../nbs/01_hello_world.ipynb 19
+from fastcore.script import call_parse
+
+
+@call_parse
+def cli(
+    name: str = None,  # A name
+    alternative_name: str = None,  # An alternative name
+    config_file: str = None,  # config file to set env vars from
+) -> None:
+    """
+    This will print Hello World! with your name
+    """
+    config = core.get_config(config_file)  # Set env vars and get config variables
+    if name is not None:
+        config["example"]["input"]["name"] = name
+    if alternative_name is not None:
+        config["example"]["input"]["alternative_name"] = alternative_name
+
+    print(
+        hello_world(
+            config["example"]["input"]["name"],
+            config["example"]["input"]["alternative_name"],
+        )
+    )
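And a final sketch tying the template together: loading the shipped config and feeding its values to `hello_world`, assuming the package defaults are untouched (the console-script names live in entry_points.txt, whose contents are not shown here).

    # Hedged sketch: config-driven hello world with the shipped defaults.
    from ssi_analysis_result_parsers import core
    from ssi_analysis_result_parsers.hello_world import hello_world

    config = core.get_config(None)
    print(hello_world(
        config["example"]["input"]["name"],             # "Kim" by default
        config["example"]["input"]["alternative_name"]  # "Lee" by default
    ))
    # -> Hello Kim and Lee!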