edentity-1.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. edentity/__init__.py +0 -0
  2. edentity/__main__.py +114 -0
  3. edentity/_version.py +21 -0
  4. edentity/configs/multiQC_config.yaml +43 -0
  5. edentity/configs/params.yml +46 -0
  6. edentity/resources/schemas/config_schema.yaml +99 -0
  7. edentity/utils/__init__.py +0 -0
  8. edentity/utils/configs.py +75 -0
  9. edentity/workflow/Dockerfile +30 -0
  10. edentity/workflow/Snakefile +120 -0
  11. edentity/workflow/profiles/default/config.yaml +14 -0
  12. edentity/workflow/profiles/galaxy/config.yaml +14 -0
  13. edentity/workflow/profiles/slurm/config.yaml +15 -0
  14. edentity/workflow/rules/chimera.smk +14 -0
  15. edentity/workflow/rules/denoise.smk +15 -0
  16. edentity/workflow/rules/dereplication.smk +14 -0
  17. edentity/workflow/rules/filter.smk +14 -0
  18. edentity/workflow/rules/merge.smk +81 -0
  19. edentity/workflow/rules/search_exact.smk +69 -0
  20. edentity/workflow/rules/trimming.smk +14 -0
  21. edentity/workflow/scripts/chimera.py +78 -0
  22. edentity/workflow/scripts/custom_multiqc_module.py +46 -0
  23. edentity/workflow/scripts/denoise.py +72 -0
  24. edentity/workflow/scripts/dereplication.py +63 -0
  25. edentity/workflow/scripts/esv_table.py +291 -0
  26. edentity/workflow/scripts/filter.py +69 -0
  27. edentity/workflow/scripts/merge.py +89 -0
  28. edentity/workflow/scripts/primerHandler.py +150 -0
  29. edentity/workflow/scripts/runtime.py +48 -0
  30. edentity/workflow/scripts/search_exact.py +54 -0
  31. edentity/workflow/scripts/trimming.py +133 -0
  32. edentity-1.4.3.dist-info/METADATA +159 -0
  33. edentity-1.4.3.dist-info/RECORD +37 -0
  34. edentity-1.4.3.dist-info/WHEEL +5 -0
  35. edentity-1.4.3.dist-info/entry_points.txt +2 -0
  36. edentity-1.4.3.dist-info/licenses/LICENSE.md +0 -0
  37. edentity-1.4.3.dist-info/top_level.txt +1 -0
edentity/__init__.py ADDED
File without changes
edentity/__main__.py ADDED
@@ -0,0 +1,114 @@
+ # edentity/__main__.py
+ from importlib.resources import files
+ import argparse
+ import subprocess
+ from pathlib import Path
+ from edentity.utils.configs import dump_config, dump_multiqc_config, dump_profile_config
+ import os
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="eDentity Metabarcoding Pipeline",
+     )
+
+     # Project-specific
+     parser.add_argument("--raw_data_dir", help="Path to the raw input data directory", required=True)
+     parser.add_argument("--work_dir", required=True, help="Working directory for outputs and temporary files")
+     parser.add_argument("--profile", help="Snakemake profile to use (e.g., 'slurm', 'galaxy', 'default')",
+                         default=None)
+     parser.add_argument("--make_json_reports", help="Generate an extended JSON report (default: False)", action="store_true")
+
+     # if params are given through a config file
+     parser.add_argument("--config_file", help="Path to the config file", default=None)
+
+     # Fastp params
+     parser.add_argument("--average_qual", help="Minimum average quality score (default: 25)", default=25)
+     parser.add_argument("--length_required", help="Minimum read length after trimming (default: 100)", default=100)
+     parser.add_argument("--n_base_limit", help="Max N bases allowed per read (default: 0)", default=0)
+
+     # PE merging
+     parser.add_argument("--maxdiffpct", help="Max percentage difference in overlaps (default: 100)", default=100)
+     parser.add_argument("--maxdiffs", help="Max differences in overlap (default: 5)", default=5)
+     parser.add_argument("--minovlen", help="Minimum overlap length (default: 10)", default=10)
+
+     # Primer trimming
+     parser.add_argument("--forward_primer", help="Forward primer sequence", required=True)
+     parser.add_argument("--reverse_primer", help="Reverse primer sequence", required=True)
+     parser.add_argument("--anchoring", action="store_true", help="Use anchoring for primer matching")
+     parser.add_argument("--discard_untrimmed", action="store_true", help="Discard reads without primer match")
+
+     # Quality filtering
+     parser.add_argument("--min_length", help="Minimum read length after filtering (default: 100)", default=100)
+     parser.add_argument("--max_length", help="Maximum read length after filtering (default: 600)", default=600)
+     parser.add_argument("--maxEE", help="Maximum expected errors (default: 1)", default=1)
+
+     # Dereplication
+     parser.add_argument("--fasta_width", help="FASTA output line width (default: 0 for single-line)", default=0)
+
+     # Denoising
+     parser.add_argument("--alpha", help="Alpha value for denoising (default: 2)", default=2)
+     parser.add_argument("--minsize", help="Minimum size to retain sequences (default: 4)", default=4)
+
+     # Pipeline settings
+     parser.add_argument("--dataType", choices=["Illumina", "AVITI"], help="Sequencing data type", default="Illumina")
+     parser.add_argument("--cpu_cores", help="Number of CPU cores to use (default: 12)", default=12)
+     parser.add_argument("--log_level", help="Logging level (default: INFO)", default="INFO")
+
+     # Fixed paths
+     parser.add_argument("--license_file", help="Path to LICENSE file (default: LICENSE)", default="LICENSE")
+     parser.add_argument("--changelog_file", help="Path to CHANGELOG file (default: CHANGELOG)", default="CHANGELOG")
+     parser.add_argument("--conda", help="Path to conda env YAML for main tools (default: envs/vsearch.yaml)")
+     parser.add_argument("--bbtoolsConda", help="Path to conda env YAML for BBTools (default: envs/bbtools.yml)")
+
+     # snakemake extra options
+     parser.add_argument("--dry-run", action="store_true", help="Dry run mode (equivalent to -n in snakemake)")
+
+     # add common snakemake options, e.g. -n, --cores, etc.
+
+     config = vars(parser.parse_args())
+
+     # write out a temp config file:
+     # the config file has params from both the default config and the command-line args;
+     # pass this config file to snakemake
+
+     # dump temp profile config
+     work_dir = Path(config['work_dir'])
+     profile_dir = work_dir / "edentity_pipeline_settings" / f"{os.path.basename(work_dir)}_snakemake_profile"
+     profile_dir.mkdir(parents=True, exist_ok=True)
+     profile_path = profile_dir / "config.yaml"
+     dump_profile_config(profile_path)
+     profile_path = profile_path.resolve()
+
+     # dump snakemake config file (if the user provides a config file, it will be used to override the default params)
+     work_dir.mkdir(parents=True, exist_ok=True)
+     snakemake_config_path = work_dir / "edentity_pipeline_settings" / f"{os.path.basename(work_dir)}_snakemake_config.yml"
+     dump_config(config, snakemake_config_path)
+     snakemake_config_path = snakemake_config_path.resolve()
+
+     # create temp multiqc config
+     multiqc_config_dir = work_dir / "edentity_pipeline_settings" / "multiqc_config"
+     multiqc_config_dir.mkdir(parents=True, exist_ok=True)
+     multiqc_config_path = multiqc_config_dir / "config.yaml"
+     dump_multiqc_config(multiqc_config_path)
+     multiqc_config_path = multiqc_config_path.resolve()
+
+     # prepare the command to run snakemake
+     cmd = [
+         "snakemake",
+         "--snakefile", str(files("edentity").joinpath("workflow/Snakefile").resolve()),
+         "--workflow-profile", profile_dir if config['profile'] is None else os.path.abspath(config['profile']),
+         "--configfile", snakemake_config_path if config['config_file'] is None else os.path.abspath(config['config_file']),
+     ]
+
+     # add snakemake extra options
+     if config['dry_run']:
+         cmd.append("--dry-run")
+
+     # run snakemake
+     subprocess.run(cmd, check=True)
+
+ if __name__ == "__main__":
+     main()
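The entry point above only wires argparse values into a temporary Snakemake config and shells out. Since the package ships an `__main__.py`, it can be exercised through `python -m`; a minimal dry-run sketch (the directory paths and primer sequences below are placeholders, not values from the package):

    # Sketch: drive the eDentity CLI through the module entry point.
    # All argument values here are illustrative placeholders.
    import subprocess

    subprocess.run(
        [
            "python", "-m", "edentity",
            "--raw_data_dir", "./raw_fastq",
            "--work_dir", "./run_001",
            "--forward_primer", "GTCGGTAAAACTCGTGCCAGC",
            "--reverse_primer", "CATAGTGGGGTATCTAATCCCAGTTTG",
            "--dry-run",  # forwarded to snakemake as --dry-run
        ],
        check=True,
    )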
edentity/_version.py ADDED
@@ -0,0 +1,21 @@
+ # file generated by setuptools-scm
+ # don't change, don't track in version control
+
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
+ TYPE_CHECKING = False
+ if TYPE_CHECKING:
+     from typing import Tuple
+     from typing import Union
+
+     VERSION_TUPLE = Tuple[Union[int, str], ...]
+ else:
+     VERSION_TUPLE = object
+
+ version: str
+ __version__: str
+ __version_tuple__: VERSION_TUPLE
+ version_tuple: VERSION_TUPLE
+
+ __version__ = version = '1.4.3'
+ __version_tuple__ = version_tuple = (1, 4, 3)
edentity/configs/multiQC_config.yaml ADDED
@@ -0,0 +1,43 @@
+ # This configuration file is used to control the behavior of the MultiQC report
+ skip_generalstats: true
+ show_analysis_paths: false
+ fn_clean_exts:
+   - ".gz"
+   - ".fastq"
+   - "_R1"
+   - "_R2"
+   - "_merged"
+ # General formatting settings for numbers
+ # formatting:
+ #   numbers:
+ #     human_readable: false  # keep numbers exact rather than abbreviated (e.g., 1M, 1K)
+
+ # handle custom data
+ custom_data:
+   edentity_summary:
+     file_format: 'tsv'
+     section_name: 'eDentity Pipeline Summary'
+     plot_type: 'table'
+
+ # Update module order to include custom content
+ module_order:
+   - edentity_summary
+   - fastp
+   - cutadapt
+ # - custom_content
+
+ # Configure table display
+ table_columns_visible:
+   edentity_summary:
+     Sample: true
+     read_pairs: true
+     merged_percent: true
+     primer_trimmed_reads: true
+     passed_quality_filtering: true
+     dereplicated_reads: true
+     reads_passed_denoising: true
+     n_esv: true
+ #   chimeric_reads: false
+ #   borderline_reads: false
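The `custom_data.edentity_summary` section consumes a tab-separated table whose columns match the `table_columns_visible` keys above. A hypothetical one-row illustration (the counts are invented; columns are tab-separated):

    Sample   read_pairs  merged_percent  primer_trimmed_reads  passed_quality_filtering  dereplicated_reads  reads_passed_denoising  n_esv
    sampleA  50000       95.2            47500                 46800                     21000               20400                   312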
edentity/configs/params.yml ADDED
@@ -0,0 +1,46 @@
+
+ # project specific
+ raw_data_dir:
+ work_dir:
+ make_json_reports: False
+
+ # tool params
+
+ # Fastp params (general quality control)
+ average_qual: 25
+ length_required: 100
+ n_base_limit: 0
+ # 3_PE_merging (these are set to vsearch default values)
+ maxdiffpct: 100
+ maxdiffs: 10
+ minovlen: 10
+
+ # 4_primer_trimming
+ forward_primer:
+ reverse_primer:
+ anchoring: False
+ discard_untrimmed: True
+
+ # 5_quality_filtering (vsearch)
+ min_length: 100
+ max_length: 600
+ maxEE: 1
+ # dereplication params
+ fasta_width: 0
+ # 6_denoising (set to default vsearch values)
+ alpha: 2
+ minsize: 4
+
+
+ # Do not change these ones :)
+ conda: "../envs/vsearch.yaml"  # strict channel priority can cause conda env-creation errors; disable it with: conda config --set channel_priority disabled
+ bbtoolsConda: "../envs/bbtools.yml"
+
+ # Pipeline settings
+ dataType: "Illumina"  # one of [Illumina, AVITI]
+
+ # nbitk settings
+ license_file: LICENSE
+ changelog_file: CHANGELOG
+ cpu_cores: 20
+ log_level: INFO
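These are the packaged defaults; `__main__.py` dumps the CLI values over them and hands the result to Snakemake. The merge semantics can be reproduced with `snakemake.utils.update_config` (a sketch; the override values are invented):

    # Sketch: user/CLI values override the packaged defaults.
    import yaml
    from snakemake.utils import update_config

    with open("edentity/configs/params.yml") as f:
        defaults = yaml.safe_load(f)               # packaged defaults

    overrides = {"maxEE": 2, "minsize": 8}         # invented user settings
    update_config(defaults, overrides)             # recursive, later values win
    print(defaults["maxEE"], defaults["minsize"])  # -> 2 8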
edentity/resources/schemas/config_schema.yaml ADDED
@@ -0,0 +1,99 @@
+ $schema: "https://json-schema.org/draft/2020-12/schema"
+ description: Configuration of eDentity pipeline settings
+ properties:
+   raw_data_dir:
+     type: string
+     description: Path to the raw data directory; raw fastq files are expected in this directory
+
+   dataType:
+     type: string
+     description: "Type of data, either Illumina or AVITI"
+     enum:
+       - Illumina
+       - AVITI
+     default: "Illumina"
+
+   work_dir:
+     type: string
+     description: Path to the project directory; this will be set as the results directory
+
+   # 3_PE_merging (these are set to vsearch default values)
+   maxdiffpct:
+     type: integer
+     description: "maximum percentage of different bases in the overlap; default is 100"
+     default: 100
+
+   maxdiffs:
+     type: integer
+     description: "maximum number of different bases in the overlap; default is 10"
+     default: 10
+
+   minovlen:
+     type: integer
+     description: "minimum length of overlap between reads; default is 10"
+     default: 10
+
+   # 4_primer_trimming
+
+   forward_primer:
+     type: string
+     description: "Sequence of the forward primer"
+
+   reverse_primer:
+     type: string
+     description: "Sequence of the reverse primer"
+
+   anchoring:
+     type: boolean
+     description: "the primer is only found if it is a prefix/suffix of the read"
+     default: False
+
+   # 5_quality_filtering
+
+   min_length:
+     type: integer
+     description: "discard sequences shorter than min_length"
+     default: 100
+
+   max_length:
+     type: integer
+     description: "discard sequences longer than max_length"
+     default: 600
+
+   maxEE:
+     type: integer
+     description: "maximum expected error value for a merged sequence"
+     default: 1
+
+   # 6_denoising (set to default vsearch values)
+
+   alpha:
+     type: integer
+     description: "cluster unoise alpha parameter"
+     default: 2
+
+   minsize:
+     type: integer
+     description: "minimum abundance; drop ESVs with abundance below minsize"
+     default: 5
+
+   # 0_general_settings
+
+   cores:
+     type: integer
+     description: "Cores used for parallelism"
+     default: 4
+
+   compression_level:
+     type: integer
+     description: "gzip compression level"
+     default: 6
+
+ required:
+   - raw_data_dir
+   - work_dir
+   - forward_primer
+   - reverse_primer
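The Snakefile ships a commented-out `validate(config, ...)` call against this schema; the same check can also be performed with the generic `jsonschema` package. A minimal sketch (the config values are placeholders):

    # Sketch: validate a run config against the schema above.
    import yaml
    from jsonschema import validate, ValidationError

    with open("edentity/resources/schemas/config_schema.yaml") as f:
        schema = yaml.safe_load(f)

    config = {
        "raw_data_dir": "raw/",
        "work_dir": "run_001/",
        "forward_primer": "GTCGGTAAAACTCGTGCCAGC",       # placeholder sequence
        "reverse_primer": "CATAGTGGGGTATCTAATCCCAGTTTG",  # placeholder sequence
    }

    try:
        validate(instance=config, schema=schema)
    except ValidationError as err:
        print(f"invalid config: {err.message}")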
edentity/utils/__init__.py ADDED
File without changes
edentity/utils/configs.py ADDED
@@ -0,0 +1,75 @@
+ import yaml
+
+ def load_config(confile_path):
+     with open(confile_path, "r") as f:
+         return yaml.safe_load(f)
+
+ # dump dicts to yaml at a given path
+ def dump_config(config, path):
+     with open(path, "w") as f:
+         yaml.dump(config, f, sort_keys=False, default_flow_style=False)
+
+
+ # extract a yaml file shipped inside the package to a target path
+ def extract_package_yaml(package_file_path, target_path):
+     # config_path = files("edentity").joinpath("workflow/profiles/default/config.yaml")
+     with open(target_path, "w") as f:
+         f.write(package_file_path.read_text())
+
+
+ def get_multiqc_config():
+     return {
+         "skip_generalstats": True,
+         "show_analysis_paths": False,
+         "fn_clean_exts": [
+             ".gz",
+             ".fastq",
+             "_R1",
+             "_R2",
+             "_merged"
+         ],
+         "custom_data": {
+             "edentity_summary": {
+                 "file_format": "tsv",
+                 "section_name": "eDentity Pipeline Summary",
+                 "plot_type": "table"
+             }
+         },
+         "module_order": [
+             "edentity_summary",
+             "fastp",
+             "cutadapt"
+         ],
+         "table_columns_visible": {
+             "edentity_summary": {
+                 "Sample": True,
+                 "read_pairs": True,
+                 "merged_percent": True,
+                 "primer_trimmed_reads": True,
+                 "passed_quality_filtering": True,
+                 "dereplicated_reads": True,
+                 "reads_passed_denoising": True,
+                 "n_esv": True
+             }
+         }
+     }
+
+ def get_default_profile():
+     return {
+         "jobs": "30",
+         "latency-wait": "30",
+         "use-conda": "False",
+         "printshellcmds": "True",
+         "rerun-incomplete": "True",
+         "keep-incomplete": "True",
+     }
+
+ def dump_multiqc_config(target_path):
+     default_config = get_multiqc_config()
+     dump_config(default_config, target_path)
+
+
+ def dump_profile_config(target_path):  # this should be flexible enough to handle different profiles, e.g. slurm, galaxy, default
+     default_profile = get_default_profile()
+     dump_config(default_profile, target_path)
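A quick round-trip of the helpers above (the target path is arbitrary):

    # Sketch: dump the default profile, then read it back.
    from edentity.utils.configs import dump_profile_config, load_config

    dump_profile_config("profile_config.yaml")    # writes get_default_profile() as YAML
    profile = load_config("profile_config.yaml")
    assert profile["jobs"] == "30"                # note: values are stored as strings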
edentity/workflow/Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM condaforge/mambaforge:latest
+
+ # Step 2: Retrieve conda environments
+
+ # Conda environment:
+ #   source: workflow/envs/vsearch.yaml
+ #   prefix: /conda-envs/163dd44b50c96433c482eacfda446195
+ #   priority: strict
+ #   name: vsearch
+ #   channels:
+ #     - conda-forge
+ #     - bioconda
+ #     - nodefaults
+ #   dependencies:
+ #     - vsearch=2.28.1
+ #     - pip
+ #     - biopython=1.84
+ #     - multiqc
+ #     - fastp
+ #     - cutadapt=4.9
+ #     - pip:
+ #       - nbitk==0.3.2
+ #
+ RUN mkdir -p /conda-envs/163dd44b50c96433c482eacfda446195
+ COPY workflow/envs/vsearch.yaml /conda-envs/163dd44b50c96433c482eacfda446195/environment.yaml
+
+ # Step 3: Generate conda environments
+
+ RUN conda env create --prefix /conda-envs/163dd44b50c96433c482eacfda446195 --file /conda-envs/163dd44b50c96433c482eacfda446195/environment.yaml && \
+     conda clean --all -y
edentity/workflow/Snakefile ADDED
@@ -0,0 +1,120 @@
+ from snakemake.logging import logger
+ from snakemake.utils import validate, min_version, update_config
+ import re, yaml, json, pandas as pd
+ import glob, os
+ from datetime import datetime
+ import numpy as np
+ import shutil
+ import sys
+ import secrets
+ min_version("8.16.0")  # declare the lowest version of snakemake that can run this workflow
+
+ # configfile: "configs/params.yml"
+
+ # validate the parameter settings
+ # validate(config, "../resources/schemas/config_schema.yaml")
+
+ # docker [image]
+ container: "docker://condaforge/mambaforge:latest"
+
+ """
+ Organize fastq file names to fit the expected naming format.
+ The idea is to symlink the fastq files into a new folder with the expected
+ naming format; this folder is removed when the pipeline succeeds.
+ """
+ GZ = True  # updated below: True if the fastq files are gzipped
+ fastq_files = os.path.join(config['work_dir'], "input_fastq_files")
+ os.makedirs(fastq_files, exist_ok=True)
+ for file in os.listdir(config['raw_data_dir']):
+     if ".fastq" not in file:
+         logger.error(f"{file} is not a valid fastq file: skipping")
+         continue
+     base_name, extension = re.split(r'_R[12]', file)
+
+     # detect if the file is gzipped
+     GZ = file.endswith(".gz")
+
+     if "R1" in file:
+         new_file_name = f"{base_name}{extension.split('.fastq')[0]}_R1.fastq.gz" if GZ else f"{base_name}{extension.split('.fastq')[0]}_R1.fastq"
+         # symlink the file if the link doesn't already exist
+         if not (os.path.islink(os.path.abspath(os.path.join(fastq_files, new_file_name))) or os.path.isfile(os.path.abspath(os.path.join(fastq_files, new_file_name)))):
+             os.symlink(os.path.abspath(os.path.join(config['raw_data_dir'], file)), os.path.abspath(os.path.join(fastq_files, new_file_name)))
+     elif "R2" in file:
+         new_file_name = f"{base_name}{extension.split('.fastq')[0]}_R2.fastq.gz" if GZ else f"{base_name}{extension.split('.fastq')[0]}_R2.fastq"
+         if not (os.path.islink(os.path.abspath(os.path.join(fastq_files, new_file_name))) or os.path.isfile(os.path.abspath(os.path.join(fastq_files, new_file_name)))):
+             os.symlink(os.path.abspath(os.path.join(config['raw_data_dir'], file)), os.path.abspath(os.path.join(fastq_files, new_file_name)))
+     else:
+         logger.error(f"Could not determine read direction (R1/R2) for {file}: skipping")
+
+ """
+ Loop through each sample in the SAMPLES list.
+ For each sample, determine the base name by removing the "_R1" or "_R2" suffix if present.
+ Construct the file paths for the R1 and R2 fastq(.gz) files using the base name.
+ Check whether both R1 and R2 files exist in the staged input directory.
+ If both files exist, append the base name to the SAMPLE_NAMES list;
+ otherwise log an error message naming the missing file(s) for the sample.
+ """
+ SAMPLES, EXTENSION = glob_wildcards(os.path.join(fastq_files, '{sample}_{extension}'))
+ SAMPLE_NAMES = []
+
+ for sample in SAMPLES:
+     base_name = "_".join(sample.split("_")[:-1]) if sample.endswith(("_R1", "_R2")) else sample
+     r1_path = os.path.join(fastq_files, f'{base_name}_R1.fastq.gz') if GZ else os.path.join(fastq_files, f'{base_name}_R1.fastq')
+     r2_path = os.path.join(fastq_files, f'{base_name}_R2.fastq.gz') if GZ else os.path.join(fastq_files, f'{base_name}_R2.fastq')
+
+     if (os.path.isfile(r1_path) or os.path.islink(r1_path)) and (os.path.isfile(r2_path) or os.path.islink(r2_path)):
+         SAMPLE_NAMES.append(base_name)
+     else:
+         logger.error(f"Missing file(s) for sample {base_name}: {r1_path} or {r2_path}")
+
+ onstart:
+     print("Starting the pipeline")
+     update_config(config,
+                   {"runID": f"MBR_{secrets.token_hex(8)}",
+                    "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    "snakemake_version": snakemake.__version__,
+                    "command_line_args": sys.argv})
+
+ # flow of execution
+ ruleorder: merge > trimming > filter > dereplication > denoise > removeChimera > searchExact
+
+ rule all:
+     input:
+         # Exact sequence search (this is where the magic of making ESVs happens)
+         expand(os.path.join(config["work_dir"], "Results", "ESV_tables", f'{{sample}}_ESV_table.tsv'), sample=SAMPLE_NAMES),
+
+         # MultiQC report
+         os.path.join(config["work_dir"], "Results", "report",
+                      f"{os.path.basename(config['work_dir'])}_multiqc_reports",
+                      f"{os.path.basename(config['work_dir'])}_multiqc_report.html"),
+         # custom multiqc data
+         os.path.join(config["work_dir"], "Results", "report",
+                      f"{os.path.basename(config['work_dir'])}_custom_multiqc_data_mqc.txt"),
+
+         # ESV table
+         os.path.join(config["work_dir"], "Results",
+                      "report", f"{os.path.basename(config['work_dir'])}_ESV_table.tsv"),
+
+         # summary report
+         os.path.join(config["work_dir"], "Results",
+                      "report", f"{os.path.basename(config['work_dir'])}_summary_report.tsv")
+
+ onsuccess:
+     # remove the staged fastq links (to reduce the storage footprint);
+     # these are symlinks, so little space is used, but it is still good hygiene to remove them
+     shutil.rmtree(fastq_files)
+     # remove per-sample ESV tables
+     ESV_tables = os.path.join(config["work_dir"], "Results", "ESV_tables")
+     if os.path.exists(ESV_tables):
+         logger.info(f"Removing ESV tables directory: {ESV_tables}")
+         shutil.rmtree(ESV_tables)
+
+ include: "rules/merge.smk"
+ include: "rules/trimming.smk"
+ include: "rules/filter.smk"
+ include: "rules/dereplication.smk"
+ include: "rules/denoise.smk"
+ include: "rules/chimera.smk"
+ include: "rules/search_exact.smk"
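The sample-discovery logic above boils down to stripping a trailing `_R1`/`_R2` read tag; a standalone sketch of the base-name derivation (the file names are invented):

    # Sketch: derive sample base names from paired fastq file names,
    # mirroring the Snakefile's SAMPLE_NAMES logic. File names are made up.
    files = ["S01_L001_R1.fastq.gz", "S01_L001_R2.fastq.gz", "S02_R1.fastq.gz"]

    names = []
    for f in files:
        stem = f.split(".fastq")[0]                # "S01_L001_R1"
        if stem.endswith(("_R1", "_R2")):
            stem = "_".join(stem.split("_")[:-1])  # "S01_L001"
        if stem not in names:
            names.append(stem)

    print(names)  # ['S01_L001', 'S02']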
edentity/workflow/profiles/default/config.yaml ADDED
@@ -0,0 +1,14 @@
+ jobs: "30"
+ max-jobs-per-second: "10"
+ max-status-checks-per-second: "10"
+ local-cores: 10
+ latency-wait: "30"
+ use-conda: "False"
+ printshellcmds: "True"
+ rerun-incomplete: "False"
+ keep-incomplete: "True"
+
+ default-resources:
+   - runtime=100
+   - mem_mb=6000
+   - disk_mb=1000000
edentity/workflow/profiles/galaxy/config.yaml ADDED
@@ -0,0 +1,14 @@
+ jobs: "30"
+ max-jobs-per-second: "10"
+ max-status-checks-per-second: "10"
+ local-cores: 44
+ latency-wait: "30"
+ # use-conda: "True"
+ printshellcmds: "True"
+ rerun-incomplete: "False"
+ keep-incomplete: "True"
+
+ # default-resources:
+ #   - runtime=100
+ #   - mem_mb=6000
+ #   - disk_mb=1000000
edentity/workflow/profiles/slurm/config.yaml ADDED
@@ -0,0 +1,15 @@
+ executor: "slurm"
+ jobs: "30"
+ max-jobs-per-second: "10"
+ max-status-checks-per-second: "10"
+ local-cores: 44
+ latency-wait: "30"
+ use-conda: "True"
+ printshellcmds: "True"
+ rerun-incomplete: "False"
+ keep-incomplete: "True"
+
+ default-resources:
+   - runtime=100
+   - mem_mb=6000
+   - disk_mb=1000000
edentity/workflow/rules/chimera.smk ADDED
@@ -0,0 +1,14 @@
+ import os
+
+ rule removeChimera:
+     input:
+         denoised = os.path.join(config["work_dir"], "Results", "denoise", f'{{sample}}_merged_trimmed_filtered_derep_denoised.fasta'),
+         denoise_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_denoise_report.tsv')
+     output:
+         ESV_fasta = os.path.join(config["work_dir"], "Results", "ESVs_fasta", f'{{sample}}_ESV.fasta'),
+         summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_remove_chimera_report.tsv'))
+     log:
+         log = os.path.join(config["work_dir"], "logs", "chimera", f'{{sample}}_chimera.log')
+     conda: config['conda']
+     script:
+         "../scripts/chimera.py"
edentity/workflow/rules/denoise.smk ADDED
@@ -0,0 +1,15 @@
+ import os
+
+ rule denoise:
+     input:
+         derep = os.path.join(config["work_dir"], "Results", "dereplication", f'{{sample}}_merged_trimmed_filtered_derep.fasta'),
+         derep_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_derep_report.tsv')
+     output:
+         denoised = temp(os.path.join(config["work_dir"], "Results", "denoise", f'{{sample}}_merged_trimmed_filtered_derep_denoised.fasta')),
+         summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_denoise_report.tsv'))
+
+     log:
+         log = os.path.join(config["work_dir"], "logs", "denoise", f'{{sample}}_denoising.log')
+     conda: config['conda']
+     script:
+         "../scripts/denoise.py"
edentity/workflow/rules/dereplication.smk ADDED
@@ -0,0 +1,14 @@
+ import os
+
+ rule dereplication:
+     input:
+         filtered = os.path.join(config["work_dir"], "Results", "filter", f'{{sample}}_merged_trimmed_filtered.fasta'),
+         filter_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_filter_report.tsv')
+     output:
+         derep = temp(os.path.join(config["work_dir"], "Results", "dereplication", f'{{sample}}_merged_trimmed_filtered_derep.fasta')),
+         summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_derep_report.tsv'))
+     log:
+         log = os.path.join(config["work_dir"], "logs", "dereplication", f'{{sample}}_dereplication.log')
+     conda: config['conda']
+     script:
+         "../scripts/dereplication.py"
edentity/workflow/rules/filter.smk ADDED
@@ -0,0 +1,14 @@
+ import os
+
+ rule filter:
+     input:
+         trimmed = os.path.join(config["work_dir"], "Results", "trimming", "trimmed_seqs", f'{{sample}}_merged_trimmed.fastq'),
+         trimming_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_trimming_report.tsv')
+     output:
+         filtered = temp(os.path.join(config["work_dir"], "Results", "filter", f'{{sample}}_merged_trimmed_filtered.fasta')),
+         summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_filter_report.tsv'))
+     log:
+         log = os.path.join(config["work_dir"], "logs", "filter", f'{{sample}}_filter.log')
+     conda: config['conda']
+     script:
+         "../scripts/filter.py"