edentity 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edentity/__init__.py +0 -0
- edentity/__main__.py +114 -0
- edentity/_version.py +21 -0
- edentity/configs/multiQC_config.yaml +43 -0
- edentity/configs/params.yml +46 -0
- edentity/resources/schemas/config_schema.yaml +99 -0
- edentity/utils/__init__.py +0 -0
- edentity/utils/configs.py +75 -0
- edentity/workflow/Dockerfile +30 -0
- edentity/workflow/Snakefile +120 -0
- edentity/workflow/profiles/default/config.yaml +14 -0
- edentity/workflow/profiles/galaxy/config.yaml +14 -0
- edentity/workflow/profiles/slurm/config.yaml +15 -0
- edentity/workflow/rules/chimera.smk +14 -0
- edentity/workflow/rules/denoise.smk +15 -0
- edentity/workflow/rules/dereplication.smk +14 -0
- edentity/workflow/rules/filter.smk +14 -0
- edentity/workflow/rules/merge.smk +81 -0
- edentity/workflow/rules/search_exact.smk +69 -0
- edentity/workflow/rules/trimming.smk +14 -0
- edentity/workflow/scripts/chimera.py +78 -0
- edentity/workflow/scripts/custom_multiqc_module.py +46 -0
- edentity/workflow/scripts/denoise.py +72 -0
- edentity/workflow/scripts/dereplication.py +63 -0
- edentity/workflow/scripts/esv_table.py +291 -0
- edentity/workflow/scripts/filter.py +69 -0
- edentity/workflow/scripts/merge.py +89 -0
- edentity/workflow/scripts/primerHandler.py +150 -0
- edentity/workflow/scripts/runtime.py +48 -0
- edentity/workflow/scripts/search_exact.py +54 -0
- edentity/workflow/scripts/trimming.py +133 -0
- edentity-1.4.3.dist-info/METADATA +159 -0
- edentity-1.4.3.dist-info/RECORD +37 -0
- edentity-1.4.3.dist-info/WHEEL +5 -0
- edentity-1.4.3.dist-info/entry_points.txt +2 -0
- edentity-1.4.3.dist-info/licenses/LICENSE.md +0 -0
- edentity-1.4.3.dist-info/top_level.txt +1 -0
edentity/__init__.py
ADDED
|
File without changes
|
edentity/__main__.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# edentity/__main__.py
|
|
2
|
+
from importlib.resources import files
|
|
3
|
+
import argparse
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from edentity.utils.configs import dump_config, dump_multiqc_config, dump_profile_config
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """CLI entry point for the eDentity metabarcoding pipeline.

    Collects all parameters, writes the run-specific Snakemake profile,
    Snakemake config and MultiQC config into
    <work_dir>/edentity_pipeline_settings, then launches Snakemake on the
    packaged workflow. Propagates subprocess.CalledProcessError when
    Snakemake exits non-zero.
    """
    parser = argparse.ArgumentParser(
        description="eDentity Metabarcoding Pipeline",
    )

    # Project-specific
    parser.add_argument("--raw_data_dir", help="Path to the raw input data directory", required=True, )
    parser.add_argument("--work_dir", required=True, help="Working directory for outputs and temporary files")
    parser.add_argument("--profile", help="Snakemake profile to use (e.g., 'slurm', 'galaxy', 'default')",
                        default=None)
    parser.add_argument("--make_json_reports", help="Generate an extended JSON report (default: False)", action="store_true")

    # if params are given through config file
    parser.add_argument("--config_file", help="Path to the config file", default = None)

    # Fastp params (type=int so CLI values reach the YAML config as numbers, not strings)
    parser.add_argument("--average_qual", help="Minimum average quality score (default: 25)", type=int, default=25)
    parser.add_argument("--length_required", help="Minimum read length after trimming (default: 100)", type=int, default=100)
    parser.add_argument("--n_base_limit", help="Max N bases allowed per read (default: 0)", type=int, default=0)

    # PE merging
    parser.add_argument("--maxdiffpct", help="Max percentage difference in overlaps (default: 100)", type=int, default=100)
    parser.add_argument("--maxdiffs", help="Max differences in overlap (default: 5)", type=int, default=5)
    parser.add_argument("--minovlen", help="Minimum overlap length (default: 10)", type=int, default=10)

    # Primer trimming
    parser.add_argument("--forward_primer", help="Forward primer sequence", required=True)
    parser.add_argument("--reverse_primer", help="Reverse primer sequence", required=True)
    parser.add_argument("--anchoring", action="store_true", help="Use anchoring for primer matching")
    parser.add_argument("--discard_untrimmed", action="store_true", help="Discard reads without primer match")

    # Quality filtering
    parser.add_argument("--min_length", help="Minimum read length after filtering (default: 100)", type=int, default=100)
    parser.add_argument("--max_length", help="Maximum read length after filtering (default: 600)", type=int, default=600)
    parser.add_argument("--maxEE", help="Maximum expected errors (default: 1)", type=int, default=1)

    # Dereplication
    parser.add_argument("--fasta_width", help="FASTA output line width (default: 0 for single-line)", type=int, default=0)

    # Denoising
    parser.add_argument("--alpha", help="Alpha value for chimera detection (default: 2)", type=int, default=2)
    parser.add_argument("--minsize", help="Minimum size to retain sequences (default: 4)", type=int, default=4)

    # Pipeline settings
    parser.add_argument("--dataType", choices=["Illumina", "AVITI"], help="Sequencing data type", default="Illumina")
    parser.add_argument("--cpu_cores", help="Number of CPU cores to use (default: 12)", type=int, default=12)
    parser.add_argument("--log_level", help="Logging level (default: INFO)", default="INFO")

    # Fixed paths
    parser.add_argument("--license_file", help="Path to LICENSE file (default: LICENSE)", default="LICENSE")
    parser.add_argument("--changelog_file", help="Path to CHANGELOG file (default: CHANGELOG)", default="CHANGELOG")
    parser.add_argument("--conda", help="Path to conda env YAML for main tools (default: envs/vsearch.yaml)")
    parser.add_argument("--bbtoolsConda", help="Path to conda env YAML for BBTools (default: envs/bbtools.yml)")

    # snakemake extra options
    parser.add_argument("--dry-run", action="store_true", help="Dry run mode (equivalent to -n in snakemake)")

    config = vars(parser.parse_args())

    # Write a temporary config file combining the defaults and the command
    # line args, then hand it to snakemake via --configfile.

    # dump temp profile config
    work_dir = Path(config['work_dir'])
    profile_dir = work_dir / "edentity_pipeline_settings" / f"{os.path.basename(work_dir)}_snakemake_profile"
    profile_dir.mkdir(parents=True, exist_ok=True)
    profile_path = profile_dir / "config.yaml"
    dump_profile_config(profile_path)
    profile_path = profile_path.resolve()

    # dump snakemake config file (a user-supplied --config_file overrides it below)
    work_dir.mkdir(parents=True, exist_ok=True)
    snakemake_config_path = work_dir /"edentity_pipeline_settings" / f"{os.path.basename(work_dir)}_snakemake_config.yml"
    dump_config(config, snakemake_config_path)
    snakemake_config_path = snakemake_config_path.resolve()

    # create temp multiqc config.
    multiqc_config_dir = work_dir / "edentity_pipeline_settings" / "multiqc_config"
    multiqc_config_dir.mkdir(parents=True, exist_ok=True)
    multiqc_config_path = multiqc_config_dir / "config.yaml"
    dump_multiqc_config(multiqc_config_path)
    multiqc_config_path = multiqc_config_path.resolve()

    # prepare the command to run snakemake
    cmd = [
        "snakemake",
        "--snakefile", str(files("edentity").joinpath("workflow/Snakefile").resolve()),
        "--workflow-profile", profile_dir if config['profile'] is None else os.path.abspath(config['profile']),
        "--configfile", snakemake_config_path if config['config_file'] is None else os.path.abspath(config['config_file']),
    ]

    # add snakemake extra options
    if config['dry_run']:
        cmd.append("--dry-run")

    # run snakemake
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    main()
|
edentity/_version.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]

# The typing imports below are only resolved by static type checkers;
# at runtime VERSION_TUPLE degrades to a plain object placeholder.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '1.4.3'
__version_tuple__ = version_tuple = (1, 4, 3)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# # This configuration file is used to control the behavior of the MultiQC report
|
|
2
|
+
skip_generalstats: true
|
|
3
|
+
show_analysis_paths: false
|
|
4
|
+
fn_clean_exts:
|
|
5
|
+
- ".gz"
|
|
6
|
+
- ".fastq"
|
|
7
|
+
- "_R1"
|
|
8
|
+
- "_R2"
|
|
9
|
+
- "_merged"
|
|
10
|
+
# # General formatting settings for numbers
|
|
11
|
+
# formatting:
|
|
12
|
+
# numbers:
|
|
13
|
+
# human_readable: false # Ensure all numbers are human-readable (e.g., 1M, 1K)
|
|
14
|
+
|
|
15
|
+
# handle custom data
|
|
16
|
+
custom_data:
|
|
17
|
+
edentity_summary:
|
|
18
|
+
file_format: 'tsv'
|
|
19
|
+
section_name: 'eDentity Pipeline Summary'
|
|
20
|
+
plot_type: 'table'
|
|
21
|
+
|
|
22
|
+
# Update module order to include custom_content
|
|
23
|
+
module_order:
|
|
24
|
+
- edentity_summary
|
|
25
|
+
- fastp
|
|
26
|
+
- cutadapt
|
|
27
|
+
# - custom_content
|
|
28
|
+
|
|
29
|
+
# Configure table display
|
|
30
|
+
table_columns_visible:
|
|
31
|
+
edentity_summary:
|
|
32
|
+
Sample: true
|
|
33
|
+
read_pairs: true
|
|
34
|
+
merged_percent: true
|
|
35
|
+
primer_trimmed_reads: true
|
|
36
|
+
passed_quality_filtering: true
|
|
37
|
+
dereplicated_reads: true
|
|
38
|
+
reads_passed_denoising: true
|
|
39
|
+
n_esv: true
|
|
40
|
+
# chimeric_reads: false
|
|
41
|
+
# borderline_reads: false
|
|
42
|
+
|
|
43
|
+
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
|
|
2
|
+
#project specific
|
|
3
|
+
raw_data_dir:
|
|
4
|
+
work_dir:
|
|
5
|
+
make_json_reports: False
|
|
6
|
+
|
|
7
|
+
# tool params
|
|
8
|
+
|
|
9
|
+
# Fastp params (general quality control)
|
|
10
|
+
average_qual: 25
|
|
11
|
+
length_required: 100
|
|
12
|
+
n_base_limit: 0
|
|
13
|
+
# 3_PE_merging (these are set to vsearch default values)
|
|
14
|
+
maxdiffpct: 100
|
|
15
|
+
maxdiffs: 10
|
|
16
|
+
minovlen: 10
|
|
17
|
+
|
|
18
|
+
#4_primer_trimming
|
|
19
|
+
forward_primer:
|
|
20
|
+
reverse_primer:
|
|
21
|
+
anchoring: False
|
|
22
|
+
discard_untrimmed: True
|
|
23
|
+
|
|
24
|
+
#5_quality_filtering (vsearch)
|
|
25
|
+
min_length: 100
|
|
26
|
+
max_length: 600
|
|
27
|
+
maxEE: 1
|
|
28
|
+
# dereplication params
|
|
29
|
+
fasta_width: 0
|
|
30
|
+
# 6_denoising (set to default vsearch values)
|
|
31
|
+
alpha: 2
|
|
32
|
+
minsize: 4
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Do not change these ones :)
|
|
36
|
+
conda: "../envs/vsearch.yaml" # strict channel can cause conda creation errors: disable this with: conda config --set channel_priority disabled
|
|
37
|
+
bbtoolsConda: "../envs/bbtools.yml"
|
|
38
|
+
|
|
39
|
+
# Pipeline settings
|
|
40
|
+
dataType: "Illumina" # [Illumina, AVITI], one of the two
|
|
41
|
+
|
|
42
|
+
# nbitk settings
|
|
43
|
+
license_file: LICENSE
|
|
44
|
+
changelog_file: CHANGELOG
|
|
45
|
+
cpu_cores: 20
|
|
46
|
+
log_level: INFO
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
$schema: "https://json-schema.org/draft/2020-12/schema"
|
|
2
|
+
description: Configuration of apscale tool settings
|
|
3
|
+
properties:
|
|
4
|
+
raw_data_dir:
|
|
5
|
+
type: string
|
|
6
|
+
description: Path to the raw data directory, raw fastq file are expected in this directory
|
|
7
|
+
|
|
8
|
+
dataType:
|
|
9
|
+
type: string
|
|
10
|
+
description: "Type of data, either Illumina or AVITI"
|
|
11
|
+
enum:
|
|
12
|
+
- Illumina
|
|
13
|
+
- AVITI
|
|
14
|
+
default: "Illumina"
|
|
15
|
+
|
|
16
|
+
work_dir:
|
|
17
|
+
type: string
|
|
18
|
+
description: Path to the project directory; this will be set as results directory
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# 3_PE_merging (these are set to vsearch default values)
|
|
22
|
+
maxdiffpct:
|
|
23
|
+
type: integer
|
|
24
|
+
description: "maximum percentage diff. bases in overlap; default is 100"
|
|
25
|
+
default: 100
|
|
26
|
+
|
|
27
|
+
maxdiffs:
|
|
28
|
+
type: integer
|
|
29
|
+
description: "maximum number of different bases in overlap; default is 10"
|
|
30
|
+
default: 10
|
|
31
|
+
|
|
32
|
+
minovlen:
|
|
33
|
+
type: integer
|
|
34
|
+
description: "minimum length of overlap between reads, default is 10"
|
|
35
|
+
default: 10
|
|
36
|
+
|
|
37
|
+
# 4_primer_trimming
|
|
38
|
+
|
|
39
|
+
forward_primer:
|
|
40
|
+
type: string
|
|
41
|
+
description: "Sequence of the forward primer"
|
|
42
|
+
|
|
43
|
+
reverse_primer:
|
|
44
|
+
type: string
|
|
45
|
+
description: "Sequence of the reverse primer"
|
|
46
|
+
|
|
47
|
+
anchoring:
|
|
48
|
+
type: boolean
|
|
49
|
+
description: "the primer is only found if it is a suffix/prefix of the read"
|
|
50
|
+
default: False
|
|
51
|
+
|
|
52
|
+
# 5_quality_filtering
|
|
53
|
+
|
|
54
|
+
min_length:
|
|
55
|
+
type: integer
|
|
56
|
+
description: "discard if length of sequence is shorter than min_length"
|
|
57
|
+
default: 100
|
|
58
|
+
|
|
59
|
+
max_length:
|
|
60
|
+
type: integer
|
|
61
|
+
description: "discard if length of sequence is longer than max_length"
|
|
62
|
+
default: 600
|
|
63
|
+
|
|
64
|
+
maxEE:
|
|
65
|
+
type: integer
|
|
66
|
+
description: "maximum expected error value for merged sequence"
|
|
67
|
+
default: 1
|
|
68
|
+
|
|
69
|
+
# 6_denoising (set to default vsearch values)
|
|
70
|
+
|
|
71
|
+
alpha:
|
|
72
|
+
type: integer
|
|
73
|
+
description: "cluster unoise alpha parameter"
|
|
74
|
+
default: 2
|
|
75
|
+
|
|
76
|
+
minsize:
|
|
77
|
+
type: integer
|
|
78
|
+
description: "minimum abundance; drop ESVs if they are less than minsize"
|
|
79
|
+
default: 5
|
|
80
|
+
|
|
81
|
+
# 0_general_settings
|
|
82
|
+
|
|
83
|
+
cores:
|
|
84
|
+
type: integer
|
|
85
|
+
description: "Cores used for parallelism in apscale"
|
|
86
|
+
default: 4
|
|
87
|
+
|
|
88
|
+
compression_level:
|
|
89
|
+
type: integer
|
|
90
|
+
description: "gzip compression level"
|
|
91
|
+
default: 6
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
required:
|
|
95
|
+
- raw_data_dir
|
|
96
|
+
- work_dir
|
|
97
|
+
- forward_primer
|
|
98
|
+
- reverse_primer
|
|
99
|
+
|
|
File without changes
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import yaml
|
|
2
|
+
|
|
3
|
+
def load_config(confile_path):
    """Parse the YAML file at *confile_path* and return its contents."""
    with open(confile_path) as stream:
        return yaml.safe_load(stream)
|
|
6
|
+
|
|
7
|
+
# dump dicts to yaml at a given path
|
|
8
|
+
def dump_config(config, path):
    """Serialize the *config* mapping as block-style YAML at *path*,
    preserving key insertion order."""
    with open(path, "w") as stream:
        yaml.dump(config, stream, sort_keys=False, default_flow_style=False)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# extract the config file from the package
|
|
14
|
+
def extract_package_yaml(package_file_path, target_path):
    """Copy a packaged resource to *target_path*.

    *package_file_path* may be any object exposing ``read_text()`` (e.g. an
    ``importlib.resources`` traversable or a ``pathlib.Path``).
    """
    content = package_file_path.read_text()
    with open(target_path, "w") as out:
        out.write(content)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_multiqc_config():
    """Return the built-in default MultiQC configuration as a dict."""
    # columns shown in the custom eDentity summary table
    summary_columns = {
        "Sample": True,
        "read_pairs": True,
        "merged_percent": True,
        "primer_trimmed_reads": True,
        "passed_quality_filtering": True,
        "dereplicated_reads": True,
        "reads_passed_denoising": True,
        "n_esv": True,
    }
    return {
        "skip_generalstats": True,
        "show_analysis_paths": False,
        # suffixes stripped from file names when deriving sample names
        "fn_clean_exts": [".gz", ".fastq", "_R1", "_R2", "_merged"],
        "custom_data": {
            "edentity_summary": {
                "file_format": "tsv",
                "section_name": "eDentity Pipeline Summary",
                "plot_type": "table",
            }
        },
        "module_order": ["edentity_summary", "fastp", "cutadapt"],
        "table_columns_visible": {"edentity_summary": summary_columns},
    }
|
|
56
|
+
|
|
57
|
+
def get_default_profile():
    """Return the fallback Snakemake profile options.

    Values are strings because they are written verbatim into the profile
    YAML consumed by Snakemake.
    """
    profile = {
        "jobs": "30",
        "latency-wait": "30",
        "use-conda": "False",
    }
    profile.update({
        "printshellcmds": "True",
        "rerun-incomplete": "True",
        "keep-incomplete": "True",
    })
    return profile
|
|
67
|
+
|
|
68
|
+
def dump_multiqc_config(target_path):
    """Write the built-in default MultiQC configuration to *target_path*."""
    dump_config(get_multiqc_config(), target_path)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def dump_profile_config(target_path):
    """Write the default Snakemake profile to *target_path*.

    NOTE(review): always emits the default profile today; could be extended
    to select slurm/galaxy variants.
    """
    dump_config(get_default_profile(), target_path)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Container image for the eDentity workflow: starts from mambaforge and
# pre-builds the vsearch conda environment at the hash-named prefix
# (presumably generated by `snakemake --containerize` — confirm before editing).
FROM condaforge/mambaforge:latest

# Step 2: Retrieve conda environments

# Conda environment:
# source: workflow/envs/vsearch.yaml
# prefix: /conda-envs/163dd44b50c96433c482eacfda446195
# priority: strict
# name: vsearch
# channels:
# - conda-forge
# - bioconda
# - nodefaults
# dependencies:
# - vsearch=2.28.1
# - pip
# - biopython=1.84
# - multiqc
# - fastp
# - cutadapt=4.9
# - pip:
# - nbitk==0.3.2
#
RUN mkdir -p /conda-envs/163dd44b50c96433c482eacfda446195
COPY workflow/envs/vsearch.yaml /conda-envs/163dd44b50c96433c482eacfda446195/environment.yaml

# Step 3: Generate conda environments

# Build the environment at the expected prefix, then clean caches to keep
# the image small.
RUN conda env create --prefix /conda-envs/163dd44b50c96433c482eacfda446195 --file /conda-envs/163dd44b50c96433c482eacfda446195/environment.yaml && \
    conda clean --all -y
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from snakemake.logging import logger
|
|
2
|
+
from snakemake.utils import validate, min_version, update_config
|
|
3
|
+
import re, yaml, json, pandas as pd
|
|
4
|
+
import glob, os
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import numpy as np
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
import secrets
|
|
10
|
+
min_version("8.16.0") # lowest snakemake version known to run this workflow

# configfile: "configs/params.yml"

# validate the parameter settings
# validate(config, "../resources/schemas/config_schema.yaml")

# docker [image] used when running with --use-singularity/--use-apptainer
container: "docker://condaforge/mambaforge:latest"
|
|
21
|
+
"""
Organize fastq file names to fit the expected naming format: every raw
fastq is symlinked into <work_dir>/input_fastq_files as
<sample>[<rest>]_R[12].fastq[.gz]. The link farm is removed on pipeline
success (see onsuccess).
"""
GZ = True  # updated per file below; True when the fastq files are gzipped
fastq_files = os.path.join(config['work_dir'], "input_fastq_files")
os.makedirs(fastq_files, exist_ok=True)
for file in os.listdir(config['raw_data_dir']):
    if ".fastq" not in file:
        logger.error(f"{file} is not a valid fastq file: skipping")
        continue

    # Locate the read-direction token. Files without _R1/_R2 cannot be
    # paired and are skipped (previously this crashed in re.split and the
    # fallback branch referenced undefined variables).
    match = re.search(r'_R([12])', file)
    if match is None:
        logger.error(f"Cannot determine read direction (_R1/_R2) for {file}: skipping")
        continue
    base_name = file[:match.start()]
    extension = file[match.end():]

    # detect if the file is gzipped
    GZ = file.endswith(".gz")

    # rebuild "<base><rest>_R[12].fastq[.gz]" with the read token moved to the end
    suffix = ".fastq.gz" if GZ else ".fastq"
    new_file_name = f"{base_name}{extension.split('.fastq')[0]}_R{match.group(1)}{suffix}"
    link_path = os.path.abspath(os.path.join(fastq_files, new_file_name))
    # Symlink only when nothing (file, link, or dangling link) exists yet;
    # the previous check could re-create an existing path and raise
    # FileExistsError.
    if not os.path.lexists(link_path):
        os.symlink(os.path.abspath(os.path.join(config['raw_data_dir'], file)), link_path)
|
|
49
|
+
|
|
50
|
+
"""
For each discovered sample, strip the trailing _R1/_R2 read token to get
the base name, build the expected mate paths, and keep the sample only
when both R1 and R2 exist (as files or symlinks). Missing mates are
logged as errors.
"""
SAMPLES, EXTENSION = glob_wildcards(os.path.join(fastq_files, '{sample}_{extension}'))
SAMPLE_NAMES = []

# extension of the (possibly gzipped) renamed inputs
mate_suffix = ".fastq.gz" if GZ else ".fastq"
for sample in SAMPLES:
    base_name = "_".join(sample.split("_")[:-1]) if sample.endswith(("_R1", "_R2")) else sample
    # each sample matches the wildcard twice (R1 and R2): dedupe up front
    # (previously every sample appeared twice in SAMPLE_NAMES)
    if base_name in SAMPLE_NAMES:
        continue
    r1_path = os.path.join(fastq_files, f'{base_name}_R1{mate_suffix}')
    r2_path = os.path.join(fastq_files, f'{base_name}_R2{mate_suffix}')

    # islink() also accepts a dangling link here; a broken target will
    # surface later when the pipeline reads the file
    if (os.path.isfile(r1_path) or os.path.islink(r1_path)) and (os.path.isfile(r2_path) or os.path.islink(r2_path)):
        SAMPLE_NAMES.append(base_name)
    else:
        logger.error(f"Missing file(s) for sample {base_name}: {r1_path} or {r2_path}")
|
|
71
|
+
|
|
72
|
+
# Record run provenance into the config before any rule executes: a random
# run ID, the start timestamp, the snakemake version, and the exact command
# line used.
onstart:
    print("Starting the pipeline")
    update_config(config,
        { "runID": f"MBR_{secrets.token_hex(8)}",
        "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "snakemake_version": snakemake.__version__,
        "command_line_args": sys.argv})
# flow of execution
ruleorder: merge > trimming > filter > dereplication > denoise > removeChimera > searchExact
|
|
81
|
+
|
|
82
|
+
# Terminal rule: declares every final artifact the pipeline must produce.
rule all:
    input:
        # Exact sequence search (this is where the magic of making ESVs happen)
        expand(os.path.join(config["work_dir"], "Results", "ESV_tables",f'{{sample}}_ESV_table.tsv'), sample=SAMPLE_NAMES),

        # MultiQC report
        os.path.join(config["work_dir"], "Results", "report",
                     f"{os.path.basename(config['work_dir'])}_multiqc_reports",
                     f"{os.path.basename(config['work_dir'])}_multiqc_report.html"),
        # custom multiqc data
        os.path.join(config["work_dir"], "Results", "report",
                     f"{os.path.basename(config['work_dir'])}_custom_multiqc_data_mqc.txt"),

        # merged project-wide ESV table
        os.path.join(config["work_dir"], "Results",
                     "report", f"{os.path.basename(config['work_dir'])}_ESV_table.tsv"),

        # summary report
        os.path.join(config["work_dir"], "Results",
                     "report", f"{os.path.basename(config['work_dir'])}_summary_report.tsv")
|
|
103
|
+
|
|
104
|
+
# Cleanup performed only when the whole pipeline succeeded.
onsuccess:
    # remove the renamed input links (these are symlinks, so this costs no
    # storage, but it keeps the work_dir tidy)
    shutil.rmtree(fastq_files)
    # remove per-sample ESV tables (the project-wide table in report/ remains)
    ESV_tables = os.path.join(config["work_dir"], "Results", "ESV_tables")
    if os.path.exists(ESV_tables):
        logger.info(f"Removing ESV tables directory: {ESV_tables}")
        shutil.rmtree(os.path.join(config["work_dir"], "Results", "ESV_tables"))
|
|
113
|
+
|
|
114
|
+
include: "rules/merge.smk"
|
|
115
|
+
include: "rules/trimming.smk"
|
|
116
|
+
include: "rules/filter.smk"
|
|
117
|
+
include: "rules/dereplication.smk"
|
|
118
|
+
include: "rules/denoise.smk"
|
|
119
|
+
include: "rules/chimera.smk"
|
|
120
|
+
include: "rules/search_exact.smk"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
jobs: "30"
|
|
2
|
+
max-jobs-per-second: "10"
|
|
3
|
+
max-status-checks-per-second: "10"
|
|
4
|
+
local-cores: 10
|
|
5
|
+
latency-wait: "30"
|
|
6
|
+
use-conda: "False"
|
|
7
|
+
printshellcmds: "True"
|
|
8
|
+
rerun-incomplete: "False"
|
|
9
|
+
keep-incomplete: "True"
|
|
10
|
+
|
|
11
|
+
default-resources:
|
|
12
|
+
- runtime=100
|
|
13
|
+
- mem_mb=6000
|
|
14
|
+
- disk_mb=1000000
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
jobs: "30"
|
|
2
|
+
max-jobs-per-second: "10"
|
|
3
|
+
max-status-checks-per-second: "10"
|
|
4
|
+
local-cores: 44
|
|
5
|
+
latency-wait: "30"
|
|
6
|
+
# use-conda: "True"
|
|
7
|
+
printshellcmds: "True"
|
|
8
|
+
rerun-incomplete: "False"
|
|
9
|
+
keep-incomplete: "True"
|
|
10
|
+
|
|
11
|
+
# default-resources:
|
|
12
|
+
# - runtime=100
|
|
13
|
+
# - mem_mb=6000
|
|
14
|
+
# - disk_mb=1000000
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
executor: "slurm"
|
|
2
|
+
jobs: "30"
|
|
3
|
+
max-jobs-per-second: "10"
|
|
4
|
+
max-status-checks-per-second: "10"
|
|
5
|
+
local-cores: 44
|
|
6
|
+
latency-wait: "30"
|
|
7
|
+
use-conda: "True"
|
|
8
|
+
printshellcmds: "True"
|
|
9
|
+
rerun-incomplete: "False"
|
|
10
|
+
keep-incomplete: "True"
|
|
11
|
+
|
|
12
|
+
default-resources:
|
|
13
|
+
- runtime=100
|
|
14
|
+
- mem_mb=6000
|
|
15
|
+
- disk_mb=1000000
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import os

# Chimera removal step: consumes each sample's denoised fasta and produces
# the per-sample ESV fasta plus a temporary summary-report row. The actual
# work happens in scripts/chimera.py.
rule removeChimera:
    input:
        denoised = os.path.join(config["work_dir"], "Results", "denoise", f'{{sample}}_merged_trimmed_filtered_derep_denoised.fasta'),
        denoise_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_denoise_report.tsv')
    output:
        ESV_fasta = os.path.join(config["work_dir"], "Results", "ESVs_fasta", f'{{sample}}_ESV.fasta'),
        summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_remove_chimera_report.tsv'))
    log:
        log = os.path.join(config["work_dir"], "logs", "chimera", f'{{sample}}_chimera.log')
    conda: config['conda']  # main tools env (vsearch.yaml)
    script:
        "../scripts/chimera.py"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import os

# Denoising step: consumes each sample's dereplicated fasta and produces a
# temporary denoised fasta plus a summary-report row. The actual work
# happens in scripts/denoise.py.
rule denoise:
    input:
        derep = os.path.join(config["work_dir"], "Results", "dereplication", f'{{sample}}_merged_trimmed_filtered_derep.fasta'),
        derep_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_derep_report.tsv')
    output:
        denoised = temp(os.path.join(config["work_dir"], "Results", "denoise", f'{{sample}}_merged_trimmed_filtered_derep_denoised.fasta')),
        summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_denoise_report.tsv'))

    log:
        log = os.path.join(config["work_dir"], "logs", "denoise", f'{{sample}}_denoising.log')
    conda: config['conda']  # main tools env (vsearch.yaml)
    script:
        "../scripts/denoise.py"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import os

# Dereplication step: consumes each sample's quality-filtered fasta and
# produces a temporary dereplicated fasta plus a summary-report row. The
# actual work happens in scripts/dereplication.py.
rule dereplication:
    input:
        filtered = os.path.join(config["work_dir"], "Results", "filter", f'{{sample}}_merged_trimmed_filtered.fasta'),
        filter_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_filter_report.tsv')
    output:
        derep = temp(os.path.join(config["work_dir"], "Results", "dereplication", f'{{sample}}_merged_trimmed_filtered_derep.fasta')),
        summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_derep_report.tsv'))
    log:
        log = os.path.join(config["work_dir"], "logs", "dereplication", f'{{sample}}_dereplication.log')
    conda: config['conda']  # main tools env (vsearch.yaml)
    script:
        "../scripts/dereplication.py"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import os

# Quality-filtering step: consumes each sample's primer-trimmed fastq and
# produces a temporary filtered fasta plus a summary-report row. The actual
# work happens in scripts/filter.py.
rule filter:
    input:
        trimmed = os.path.join(config["work_dir"], "Results","trimming", "trimmed_seqs", f'{{sample}}_merged_trimmed.fastq'),
        trimming_report = os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_trimming_report.tsv')
    output:
        filtered = temp(os.path.join(config["work_dir"], "Results", "filter", f'{{sample}}_merged_trimmed_filtered.fasta')),
        summary_report = temp(os.path.join(config["work_dir"], "Results", "report", f'{{sample}}_filter_report.tsv'))
    log:
        log = os.path.join(config["work_dir"], "logs", "filter", f'{{sample}}_filter.log')
    conda: config['conda']  # main tools env (vsearch.yaml)
    script:
        "../scripts/filter.py"
|