ssi-analysis-result-parsers 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_parser']
5
+
6
+ # %% ../nbs/39_Legionella_parser.ipynb 3
7
+ # standard libs
8
+ import os
9
+ import re
10
+
11
+ # Common to template
12
+ # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
13
+ import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
14
+ import envyaml  # Allows loading env vars into a yaml file, https://github.com/thesimj/envyaml
15
+ import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
16
+ from fastcore import (
17
+ test,
18
+ )
19
+ from fastcore.script import (
20
+ call_parse,
21
+ ) # for @call_parse, https://fastcore.fast.ai/script
22
+ import json # for nicely printing json and yaml
23
+
24
+ # import functions from core module (optional, but most likely needed).
25
+ from ssi_analysis_result_parsers import (
26
+ core,
27
+ )
28
+ from .blast_parser import extract_presence_absence
29
+
30
+ # Project specific libraries
31
+ from pathlib import Path
32
+ import pandas
33
+ import sys
34
+
35
+ # %% ../nbs/39_Legionella_parser.ipynb 6
36
def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
    """
    Parse the Legionella SBT summary TSV and return its first result row.

    The file is expected to contain a "sample" column; the remaining columns
    of the first row are returned as a dict keyed by column name.
    Returns None (after printing a warning to stderr) when the file is absent.
    """
    if not os.path.exists(legionella_sbt_results_tsv):
        print(
            f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
            file=sys.stderr,
        )
        return None
    sbt_table = pandas.read_csv(legionella_sbt_results_tsv, sep="\t").set_index(
        "sample", drop=True
    )
    per_sample = sbt_table.to_dict(orient="index")
    first_sample = next(iter(per_sample))
    return per_sample[first_sample]
52
+
53
+
54
def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
    """
    Combine Legionella SBT typing results with lag-1 presence/absence.

    Merges the SBT summary dict and the lag-1 blast presence dict into a
    single results dict; on key collisions the SBT entries are renamed with
    an "SBT: " prefix.
    """
    sbt_results = extract_legionella_sbt(
        legionella_sbt_results_tsv=legionella_sbt_results_tsv
    )
    lag1_results = extract_presence_absence(
        blast_output_tsv=lag1_blast_tsv,
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
    return core.update_results_dict(
        sbt_results, lag1_results, old_duplicate_key_prefix="SBT: "
    )
68
+
69
+ # %% ../nbs/39_Legionella_parser.ipynb 9
70
@call_parse
def legionella_parser(
    legionella_sbt_file: Path = None,  # Path "*.sbt.tsv from legionella_sbt program"
    lag_1_blast_output: Path = None,  # Path to output from lag1_blast. Generated with blastn -query lag-1.fasta -subject assembly.fasta -outfmt "6 qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore"
    output_file: Path = None,  # Path to output tsv
    sample_name: str = None,
    config_file: str = None,  # config file to set env vars from
) -> None:
    """Command-line entry point: summarise Legionella SBT + lag-1 blast results into a TSV."""
    # NOTE: the inline comments on the parameters above are rendered as CLI help
    # text by fastcore's @call_parse -- do not remove them.
    # NOTE(review): config_file is currently unused (config loading is commented out).
    # config = core.get_config(config_file) # Set env vars and get config variables
    legionella_summary_dict = legionella_summary(
        legionella_sbt_results_tsv=legionella_sbt_file,
        lag1_blast_tsv=lag_1_blast_output,
    )
    core.print_results_dict_to_tsv(
        results_dict=legionella_summary_dict,
        output_file=output_file,
        sample_name=sample_name,
    )
@@ -0,0 +1 @@
1
+ __version__ = "0.0.1"
@@ -0,0 +1,38 @@
1
+ # Autogenerated by nbdev
2
+
3
+ d = { 'settings': { 'branch': 'main',
4
+ 'doc_baseurl': '/ssi_analysis_result_parsers',
5
+ 'doc_host': 'https://$GIT_USER_NAME.github.io',
6
+ 'git_url': 'https://github.com/$GIT_USER_NAME/ssi_analysis_result_parsers',
7
+ 'lib_path': 'ssi_analysis_result_parsers'},
8
+ 'syms': { 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': ( 'legionella_parser.html#extract_legionella_sbt',
9
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
10
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': ( 'legionella_parser.html#legionella_parser',
11
+ 'ssi_analysis_result_parsers/Legionella_parser.py'),
12
+ 'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': ( 'legionella_parser.html#legionella_summary',
13
+ 'ssi_analysis_result_parsers/Legionella_parser.py')},
14
+ 'ssi_analysis_result_parsers.blast_parser': { 'ssi_analysis_result_parsers.blast_parser.allele_matches': ( 'blast_parser.html#allele_matches',
15
+ 'ssi_analysis_result_parsers/blast_parser.py'),
16
+ 'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': ( 'blast_parser.html#extract_allele_matches',
17
+ 'ssi_analysis_result_parsers/blast_parser.py'),
18
+ 'ssi_analysis_result_parsers.blast_parser.extract_presence_absence': ( 'blast_parser.html#extract_presence_absence',
19
+ 'ssi_analysis_result_parsers/blast_parser.py'),
20
+ 'ssi_analysis_result_parsers.blast_parser.presence_absence': ( 'blast_parser.html#presence_absence',
21
+ 'ssi_analysis_result_parsers/blast_parser.py')},
22
+ 'ssi_analysis_result_parsers.core': { 'ssi_analysis_result_parsers.core.get_config': ( 'core.html#get_config',
23
+ 'ssi_analysis_result_parsers/core.py'),
24
+ 'ssi_analysis_result_parsers.core.get_samplesheet': ( 'core.html#get_samplesheet',
25
+ 'ssi_analysis_result_parsers/core.py'),
26
+ 'ssi_analysis_result_parsers.core.print_results_dict_to_tsv': ( 'core.html#print_results_dict_to_tsv',
27
+ 'ssi_analysis_result_parsers/core.py'),
28
+ 'ssi_analysis_result_parsers.core.set_env_variables': ( 'core.html#set_env_variables',
29
+ 'ssi_analysis_result_parsers/core.py'),
30
+ 'ssi_analysis_result_parsers.core.show_project_env_vars': ( 'core.html#show_project_env_vars',
31
+ 'ssi_analysis_result_parsers/core.py'),
32
+ 'ssi_analysis_result_parsers.core.update_results_dict': ( 'core.html#update_results_dict',
33
+ 'ssi_analysis_result_parsers/core.py')},
34
+ 'ssi_analysis_result_parsers.hello_world': { 'ssi_analysis_result_parsers.hello_world.cli': ( 'hello_world.html#cli',
35
+ 'ssi_analysis_result_parsers/hello_world.py'),
36
+ 'ssi_analysis_result_parsers.hello_world.hello_world': ( 'hello_world.html#hello_world',
37
+ 'ssi_analysis_result_parsers/hello_world.py')},
38
+ 'ssi_analysis_result_parsers.some_string': {}}}
@@ -0,0 +1,178 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/11_blast_parser.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['extract_presence_absence', 'extract_allele_matches', 'presence_absence', 'allele_matches']
5
+
6
+ # %% ../nbs/11_blast_parser.ipynb 3
7
+ # standard libs
8
+ import os
9
+ import re
10
+
11
+ # Common to template
12
+ # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
13
+ import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
14
+ import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
15
+ import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
16
+ from fastcore import (
17
+ test,
18
+ )
19
+ from fastcore.script import (
20
+ call_parse,
21
+ ) # for @call_parse, https://fastcore.fast.ai/script
22
+ import json # for nicely printing json and yaml
23
+
24
+ # import functions from core module (optional, but most likely needed).
25
+ from . import core
26
+
27
+ # Project specific libraries
28
+ from pathlib import Path
29
+ import pandas
30
+ import sys
31
+
32
+ # %% ../nbs/11_blast_parser.ipynb 6
33
def extract_presence_absence(
    blast_output_tsv: Path,
    tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",
    hits_as_string: bool = True,
    include_match_stats: bool = False,
    pident_threshold: float = 90,
    plen_threshold: float = 60,
    gene_names: list = None,
) -> dict:
    """
    Parse a blast -outfmt 6 TSV and report gene presence/absence.

    For each query sequence only the best hit (highest bitscore) is kept;
    hits must then exceed (strictly) both pident_threshold and the percent
    query length (plen) threshold to count as present.

    Returns:
        If hits_as_string: {"genes_found": "<g1>, <g2>, ..."} where each entry
        is "<gene>__<pident>__<plen>" when include_match_stats is True.
        Otherwise: one key per gene in gene_names (or per found gene when
        gene_names is None) mapping to "1"/"0", or to "<pident>__<plen>"/"0"
        when include_match_stats is True.
        None when blast_output_tsv does not exist (a warning goes to stderr).
    """
    if not os.path.exists(blast_output_tsv):
        print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)
        # Fix: the original fell off the end here, returning None implicitly;
        # make the missing-file result explicit.
        return None
    blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
    blast_df.columns = tsv_header.split(" ")
    # percent of the query covered by the alignment
    blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
    # keep only the best-scoring hit per query sequence
    blast_df_unique = (
        blast_df.sort_values(by=["bitscore"], ascending=False).groupby("qseqid").first()
    )
    # strict (>) thresholds preserved from the original implementation
    blast_df_filtered = blast_df_unique.query(
        "plen > @plen_threshold and pident > @pident_threshold"
    )
    if hits_as_string:
        if include_match_stats:
            found = [
                f"{gene}__{d['pident']}__{d['plen']}"
                for gene, d in blast_df_filtered.to_dict(orient="index").items()
            ]
        else:
            found = list(blast_df_filtered.index.values)
        return {"genes_found": ", ".join(found)}
    blast_dict = blast_df_filtered.to_dict(orient="index")
    if gene_names is None:
        gene_names = blast_dict.keys()
    result_dict = {}
    for gene in gene_names:
        if gene in blast_dict:
            result_dict[gene] = (
                f"{blast_dict[gene]['pident']}__{blast_dict[gene]['plen']}"
                if include_match_stats
                else "1"
            )
        else:
            result_dict[gene] = "0"
    return result_dict
96
+
97
+
98
def extract_allele_matches(
    blast_output_tsv: Path, tsv_header: str, include_match_stats: bool = False
) -> dict:
    """
    Parse a blast -outfmt 6 TSV for the best-matching allele of each gene.

    qseqid is assumed to be "<gene>_<allele>"; rows are grouped by gene and
    the hit with the highest bitscore wins.

    Returns:
        if include_match_stats:
            { <gene>: "<allele>__<pident>__<plen>", ... }
        else:
            { <gene>: <allele>, ... }
        An empty dict when blast_output_tsv does not exist (warning to stderr).
    """
    allele_dict = {}
    detailed_dict = {}
    if os.path.exists(blast_output_tsv):
        blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
        blast_df.columns = tsv_header.split(" ")
        # Fix: removed the original `blast_df.set_index("qseqid", drop=False)`
        # line -- its return value was discarded and inplace was not set, so it
        # was a no-op.
        blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
        # NOTE(review): splits on every "_", so gene names that themselves
        # contain underscores would be mis-parsed -- assumes qseqid has exactly
        # one underscore; confirm against the allele fasta naming scheme.
        blast_df[["gene", "allele"]] = blast_df["qseqid"].str.split("_", expand=True)
        blast_df_unique = (
            blast_df.sort_values(by=["bitscore"], ascending=False)
            .groupby("gene")
            .first()
        )
        for gene, d in blast_df_unique.to_dict(orient="index").items():
            allele_dict[gene] = d["allele"]
            detailed_dict[gene] = f"{d['allele']}__{d['pident']}__{d['plen']}"
    else:
        print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)

    return detailed_dict if include_match_stats else allele_dict
133
+
134
+ # %% ../nbs/11_blast_parser.ipynb 9
135
+ from fastcore.script import call_parse
136
+
137
+
138
@call_parse
def presence_absence(
    blast_output: Path = None,  # Path to blast output file. Generated with --outfmt 6 option
    blast_tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",  # headers in blast output
    hits_as_string: bool = True,  # True to print a comma separated list of found genes on a single line. False to return a key: value pair for each gene
    include_match_stats: bool = False,  # True to include percent identity and percent length in output, false to only include present/absent
    percent_identityt: float = 90,  # percent identity threshold for considering a gene present
    percent_length: float = 60,  # percent length threshold for considering a gene present
    gene_names: list = None,  # name of genes to look for when hits_as_string = False
    output_file: Path = None,
    config_file: str = None,  # config file to set env vars from
) -> None:
    """Command-line entry point: report gene presence/absence from a blast TSV."""
    # NOTE: the inline parameter comments above are the CLI help text (fastcore
    # @call_parse); `percent_identityt` keeps its typo because renaming it would
    # change the public CLI flag.
    # config = core.get_config(config_file) # Set env vars and get config variables
    gene_presence_dict = extract_presence_absence(
        blast_output_tsv=blast_output,
        tsv_header=blast_tsv_header,
        hits_as_string=hits_as_string,
        include_match_stats=include_match_stats,
        pident_threshold=percent_identityt,
        plen_threshold=percent_length,
        gene_names=gene_names,
    )
    # Fix: the original computed the result and then discarded it -- the
    # command produced no output and output_file was never used.
    if output_file is not None:
        core.print_results_dict_to_tsv(
            results_dict=gene_presence_dict, output_file=output_file
        )
    else:
        print(json.dumps(gene_presence_dict, indent=2))
161
+
162
+
163
@call_parse
def allele_matches(
    blast_output: Path = None,  # Path to blast output file. Generated with --outfmt 6 option
    blast_tsv_header: str = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore",  # headers in blast output
    include_match_stats: bool = False,  # True to include percent identity and percent length in output, false to only include allele number
    output_file: Path = None,
    config_file: str = None,  # config file to set env vars from
) -> None:
    """Command-line entry point: report the best allele match per gene from a blast TSV."""
    # config = core.get_config(config_file) # Set env vars and get config variables
    # Fix: the original passed output_file= to extract_allele_matches, which has
    # no such parameter, so every invocation raised TypeError.
    allele_dict = extract_allele_matches(
        blast_output_tsv=blast_output,
        tsv_header=blast_tsv_header,
        include_match_stats=include_match_stats,
    )
    # Write the results out instead of discarding them.
    if output_file is not None:
        core.print_results_dict_to_tsv(results_dict=allele_dict, output_file=output_file)
    else:
        print(json.dumps(allele_dict, indent=2))
@@ -0,0 +1,24 @@
1
+ # environmental (ENV) variables. These are written as SSI_ANALYSIS_RESULT_PARSERS_VARIABLENAME=VALUE to avoid conflicts with other ENV variables.
2
+ # Using the standard template these values can be overwritten by:
3
+ # - defining SSI_ANALYSIS_RESULT_PARSERS_CONFIG_FILE pointing to a similar file with a subset of values
4
+ # - setting the values as environmental variables.
5
+ # The priority goes env variables > config file > default file.
6
+ # All config files other than config.default.env are in .gitignore
7
+ # Each .env config file should have an associated .yaml config file, which is what the program interacts with.
8
+
9
+ # NOTE: remember if referencing another ENV var as a variable it needs to be defined first
10
+
11
+ # If more structured variables are needed use config.default.yaml or another of your own creation
12
+ # This file path is stored as CORE_CONFIG_FILE when overriding
13
+ # It is commented out because of the default use case, but should be included for all non default cases.
14
+ # CORE_YAML_CONFIG_FILE=
15
+ CORE_PROJECT_VARIABLE_PREFIX=SSI_ANALYSIS_RESULT_PARSERS_
16
+ # For testing purposes
17
+ CORE_TEST_VAR="Test"
18
+
19
+ # Example variable please exchange with relevant variables
20
+ SSI_ANALYSIS_RESULT_PARSERS_INPUT_DIR=./input
21
+ SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_DIR=./output
22
+ SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_FILE=${SSI_ANALYSIS_RESULT_PARSERS_OUTPUT_DIR}/output.txt
23
+ SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME=Kim
24
+
@@ -0,0 +1,9 @@
1
+ # When accessing this in the code you'll work with it as a dict.
2
+ # ENV variables will be replaced with their values. This is done with the envyaml package that is in the code template `load_config`.
3
+ # By convention all variables for the project should have the SSI_ANALYSIS_RESULT_PARSERS_* prefix.
4
+ # e.g
5
+ # name: ${SSI_ANALYSIS_RESULT_PARSERS_NAME}
6
+ example:
7
+ input:
8
+ name: ${SSI_ANALYSIS_RESULT_PARSERS_USER_INPUT_NAME}
9
+ alternative_name: Lee
@@ -0,0 +1,252 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['PACKAGE_NAME', 'DEV_MODE', 'PACKAGE_DIR', 'PROJECT_DIR', 'config', 'set_env_variables', 'get_config',
5
+ 'show_project_env_vars', 'get_samplesheet', 'update_results_dict', 'print_results_dict_to_tsv']
6
+
7
+ # %% ../nbs/00_core.ipynb 4
8
+ # Need the ssi_analysis_result_parsers for a few functions, this can be considered a static var
9
+
10
+ import importlib
11
+ import importlib.util
12
+ import os
13
+
14
# Determine where the package is installed so config files can be located
# relative to it; fall back to dev mode when the package is not importable.
PACKAGE_NAME: str = (
    "ssi_analysis_result_parsers"  # Make sure to adjust this to your package name
)
DEV_MODE: bool = (
    False  # set below to override, as this is in an export block it'll be exported while the dev mode section is not
)

PACKAGE_DIR = None
try:
    spec = importlib.util.find_spec(PACKAGE_NAME)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    PACKAGE_DIR = os.path.dirname(module.__file__)
except ImportError:
    DEV_MODE = True
except AttributeError:
    # presumably hit when find_spec returns None (package not installed),
    # making module_from_spec(None) fail -- TODO confirm
    DEV_MODE = True
PROJECT_DIR = os.getcwd()  # override value in dev mode
if PROJECT_DIR.endswith("nbs"):
    # notebooks live in <project>/nbs; step up to the project root
    DEV_MODE = True
    PROJECT_DIR = os.path.split(PROJECT_DIR)[0]
35
+
36
+ # %% ../nbs/00_core.ipynb 10
37
+ # standard libs
38
+ import os
39
+ import re
40
+
41
+ # Common to template
42
+ # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
43
+ import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
44
+ import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
45
+ import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
46
+ import pandas # For sample sheet manipulation
47
+ from fastcore import (
48
+ test,
49
+ )
50
+ from fastcore.script import (
51
+ call_parse,
52
+ ) # for @call_parse, https://fastcore.fast.ai/script
53
+
54
+ # Project specific libraries
55
+
56
+ from pathlib import Path
57
+ from hashlib import sha256
58
+
59
+ # %% ../nbs/00_core.ipynb 13
60
+ import importlib
61
+ import importlib.util
62
+
63
+
64
def set_env_variables(config_path: str, overide_env_vars: bool = True) -> bool:
    """
    Populate process environment variables from the packaged default .env file,
    then (optionally overriding) from the file at config_path.

    Returns True on success, False when loading the packaged default raised.
    """
    # Load dot env sets environmental values from a file, if the value already exists it will not be overwritten unless override is set to True.
    # If we have multiple .env files then we need to apply the one which we want to take precedence last with overide.

    # Order of precedence: .env file > environment variables > default values
    # When developing, making a change to the config will not be reflected until the environment is restarted

    # Set the env vars first, this is needed for the card.yaml to replace ENV variables
    # NOTE: You need to adjust PROJECT_NAME to your package name for this to work, the exception is only for dev purposes
    # This here checks if your package is installed, such as through pypi or through pip install -e [.dev] for development. If it is then it'll go there and use the config files there as your default values.
    try:
        dotenv.load_dotenv(f"{PACKAGE_DIR}/config/config.default.env", override=False)
    except Exception as e:
        # NOTE(review): load_dotenv returns False rather than raising when the
        # file is missing, so this branch likely only triggers on other errors
        # (e.g. PACKAGE_DIR being None) -- the printed message may be
        # misleading; confirm. `e` is captured but not reported.
        print(f"Error: {PACKAGE_DIR}/config/config.default.env does not exist")
        return False

    # 2. set values from file:
    if os.path.isfile(config_path):
        dotenv.load_dotenv(config_path, override=overide_env_vars)

    return True
85
+
86
+ # %% ../nbs/00_core.ipynb 15
87
+ import importlib
88
+ import importlib.util
89
+
90
+
91
def get_config(config_path: str = None, overide_env_vars: bool = True) -> dict:
    """
    Resolve the project configuration as a plain dict.

    Environment variables are first populated from *config_path* (see
    set_env_variables); the YAML file named by CORE_YAML_CONFIG_FILE (falling
    back to the packaged default) is then rendered with those variables
    substituted and exported as a dict. Set overide_env_vars=False to let
    pre-existing user environment variables take precedence.
    """
    set_env_variables("" if config_path is None else config_path, overide_env_vars)

    yaml_path = os.environ.get(
        "CORE_YAML_CONFIG_FILE", f"{PACKAGE_DIR}/config/config.default.yaml"
    )
    return envyaml.EnvYAML(yaml_path, strict=False).export()
106
+
107
+ # %% ../nbs/00_core.ipynb 17
108
+ # create a os.PathLike object
109
+ config = get_config(os.environ.get("CORE_CONFIG_FILE", ""))
110
+
111
+ # %% ../nbs/00_core.ipynb 19
112
def show_project_env_vars(config: dict) -> None:
    """
    Print every config entry whose key carries the project prefix.

    The prefix is read from config["CORE_PROJECT_VARIABLE_PREFIX"]; each
    matching entry is printed on its own line as KEY=VALUE. Useful for
    debugging which project variables are in effect.
    """
    for key, value in config.items():
        if key.startswith(config["CORE_PROJECT_VARIABLE_PREFIX"]):
            print(f"{key}={value}")
119
+
120
+ # %% ../nbs/00_core.ipynb 23
121
+ import pandas as pd
122
+
123
+
124
def get_samplesheet(sample_sheet_config: dict) -> pd.DataFrame:
    """
    Load and validate a sample sheet described by *sample_sheet_config*.

    Expected config keys:
        path:      path to the sample sheet file (required)
        delimiter: field separator (optional; guessed from the extension --
                   "," for .csv, tab otherwise)
        columns:   list of required column names (optional; when given the
                   returned frame is restricted to exactly those columns)

    The first line is always treated as the header row; a leading "#" is
    stripped from the header names. Fully empty rows are dropped.

    Raises:
        FileNotFoundError: when the file does not exist.
        ValueError: when a required column is missing.
    """
    sheet_path = sample_sheet_config["path"]
    if not os.path.isfile(sheet_path):
        raise FileNotFoundError(f"File {sheet_path} does not exist")

    # Delimiter: explicit config wins, otherwise best-guess from the extension.
    delimiter = sample_sheet_config.get(
        "delimiter", "," if sheet_path.endswith(".csv") else "\t"
    )
    # header row is always assumed to be the first line
    header = 0
    # None means "keep every column"
    columns = sample_sheet_config.get("columns")

    try:
        df = pd.read_csv(sheet_path, delimiter=delimiter, header=header, comment=None)
    except Exception as e:
        print(
            "Error: Could not load sample sheet into dataframe, you have a problem with your sample sheet or the configuration."
        )
        raise e

    # A conventional "#sample_id ..." header leaves a "#" on the names; strip it.
    if df.columns[0].startswith("#"):
        df.columns = [col.lstrip("#") for col in df.columns]
    if columns is not None:
        if not all(col in df.columns for col in columns):
            raise ValueError("Error: Sample sheet does not have the correct columns")
        df = df[columns]

    # Drop rows produced by blank lines in the sheet.
    return df.dropna(how="all")
193
+
194
+ # %% ../nbs/00_core.ipynb 24
195
def update_results_dict(
    old_results: dict,
    new_results: dict,
    old_duplicate_key_prefix: str = None,
    new_duplicate_key_prefix: str = None,
):
    """
    Merge two results dicts into one, disambiguating duplicate keys.

    When the key sets are disjoint the dicts are simply merged. When keys
    collide, at least one of old_duplicate_key_prefix / new_duplicate_key_prefix
    must be given; colliding keys from the corresponding dict are renamed by
    prepending that prefix (keys from a side with no prefix are kept as-is).

    Returns a NEW dict; neither input is mutated. (Fix: the original called
    old_results.update(...) in the no-collision path, silently mutating the
    caller's dict while the collision path returned a fresh one.)

    Raises:
        ValueError: duplicate keys exist but no prefix was provided, or the
            two prefixes are identical (renaming would still collide).
    """
    duplicate_keys = set(old_results) & set(new_results)
    if not duplicate_keys:
        return {**old_results, **new_results}
    if old_duplicate_key_prefix is None and new_duplicate_key_prefix is None:
        raise ValueError(
            "Provided dictionaries contain duplicate keys. old_duplicate_key_prefix and/or new_duplicate_key_prefix must be provided"
        )
    if old_duplicate_key_prefix == new_duplicate_key_prefix:
        raise ValueError(
            "old_duplicate_key_prefix and new_duplicate_key_prefix cannot be identical"
        )
    combined_dict = {}
    for key, value in old_results.items():
        if key in duplicate_keys and old_duplicate_key_prefix is not None:
            combined_dict[f"{old_duplicate_key_prefix}{key}"] = value
        else:
            combined_dict[key] = value
    for key, value in new_results.items():
        if key in duplicate_keys and new_duplicate_key_prefix is not None:
            combined_dict[f"{new_duplicate_key_prefix}{key}"] = value
        else:
            combined_dict[key] = value
    return combined_dict
237
+
238
+
239
def print_results_dict_to_tsv(
    results_dict: dict, output_file: Path, sample_name: str = None
) -> None:
    """
    Write a flat results dict to *output_file* as a two-line TSV.

    Line 1 holds the dict keys as the header and line 2 the corresponding
    values; when *sample_name* is given it is prepended as an extra
    "sample_name" column.
    """
    keys = [str(k) for k in results_dict.keys()]
    values = [str(v) for v in results_dict.values()]
    if sample_name is not None:
        keys = ["sample_name"] + keys
        values = [sample_name] + values
    # Fix: use a context manager so the handle is closed even if a write fails
    # (the original used bare open()/close() with no try/finally).
    with open(output_file, "w") as out:
        out.write("\t".join(keys) + "\n")
        out.write("\t".join(values) + "\n")
    return None
@@ -0,0 +1,61 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_hello_world.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['hello_world', 'cli']
5
+
6
+ # %% ../nbs/01_hello_world.ipynb 6
7
+ # That export there, it makes sure this code goes into the module.
8
+
9
+ # standard libs
10
+ import os
11
+ import re
12
+
13
+ # Common to template
14
+ # add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
15
+ import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
16
+ import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
17
+ import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
18
+ from fastcore import (
19
+ test,
20
+ )
21
+ from fastcore.script import (
22
+ call_parse,
23
+ ) # for @call_parse, https://fastcore.fast.ai/script
24
+ import json # for nicely printing json and yaml
25
+
26
+ # Project specific libraries
27
+
28
+ # %% ../nbs/01_hello_world.ipynb 8
29
+ from ssi_analysis_result_parsers import (
30
+ core,
31
+ )
32
+
33
+ # %% ../nbs/01_hello_world.ipynb 17
34
def hello_world(name1: str, name2: str) -> str:
    """Return a greeting addressed to both *name1* and *name2*."""
    greeting = "Hello {} and {}!".format(name1, name2)
    return greeting
36
+
37
+ # %% ../nbs/01_hello_world.ipynb 19
38
+ from fastcore.script import call_parse
39
+
40
+
41
@call_parse
def cli(
    name: str = None,  # A name
    alternative_name: str = None,  # An alternative name
    config_file: str = None,  # config file to set env vars from
) -> None:
    """
    This will print Hello World! with your name
    """
    # NOTE: the inline parameter comments above are rendered as CLI help text
    # by fastcore's @call_parse.
    config = core.get_config(config_file)  # Set env vars and get config variables
    # CLI arguments, when given, override the names coming from the config file.
    if name is not None:
        config["example"]["input"]["name"] = name
    if alternative_name is not None:
        config["example"]["input"]["alternative_name"] = alternative_name

    print(
        hello_world(
            config["example"]["input"]["name"],
            config["example"]["input"]["alternative_name"],
        )
    )