fastQpick 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2024, Pachter Lab
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.2
2
+ Name: fastQpick
3
+ Version: 0.1.0
4
+ Summary: Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.
5
+ Author-email: Joseph Rich <josephrich98@gmail.com>
6
+ Maintainer-email: Joseph Rich <josephrich98@gmail.com>
7
+ License: BSD 2-Clause License
8
+
9
+ Copyright (c) 2024, Pachter Lab
10
+
11
+ Redistribution and use in source and binary forms, with or without
12
+ modification, are permitted provided that the following conditions are met:
13
+
14
+ 1. Redistributions of source code must retain the above copyright notice, this
15
+ list of conditions and the following disclaimer.
16
+
17
+ 2. Redistributions in binary form must reproduce the above copyright notice,
18
+ this list of conditions and the following disclaimer in the documentation
19
+ and/or other materials provided with the distribution.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+
32
+ Project-URL: Homepage, https://github.com/pachterlab/fastQpick
33
+ Keywords: fastQpick,bioinformatics,statistics,RNA-seq,DNA-seq
34
+ Classifier: Environment :: Console
35
+ Classifier: Framework :: Jupyter
36
+ Classifier: Intended Audience :: Science/Research
37
+ Classifier: License :: OSI Approved :: BSD License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Programming Language :: Python :: 3.9
40
+ Classifier: Programming Language :: Python :: 3.10
41
+ Classifier: Programming Language :: Python :: 3.11
42
+ Classifier: Programming Language :: Python :: 3.12
43
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
44
+ Classifier: Topic :: Utilities
45
+ Requires-Python: >=3.7
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: pyfastx>=2.0.0
49
+ Requires-Dist: tqdm>=4.66.0
50
+
51
+ # fastQpick
52
+
53
+ Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
54
+
55
+ ---
56
+
57
+ ## Installation
58
+
59
+ ### Install via PyPI
60
+ ```bash
61
+ pip install fastQpick
62
+ ```
63
+
64
+ ### Install from Source Code
65
+
66
+ Using pip:
67
+ ```bash
68
+ pip install git+https://github.com/pachterlab/fastQpick.git
69
+ ```
70
+
71
+ Or clone the repository and build manually:
72
+ ```bash
73
+ git clone https://github.com/pachterlab/fastQpick.git
74
+ cd fastQpick
75
+ python -m build
76
+ python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Usage
82
+
83
+ ### Command-line Interface
84
+
85
+ Run `fastQpick` with a specified fraction and options:
86
+ ```bash
87
+ fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
88
+ ```
89
+
90
+ ### Python API
91
+
92
+ Use `fastQpick` in your Python code:
93
+ ```python
94
+ from fastQpick import fastQpick
95
+
96
+ fastQpick(
97
+ input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
98
+ fraction=FRACTION,
99
+ ...
100
+ )
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Documentation
106
+
107
+ - **Command-line Help**: Use the following command to see all available options:
108
+ ```bash
109
+ fastQpick --help
110
+ ```
111
+
112
+ - **Python API Help**: Use the `help` function to explore the API:
113
+ ```python
114
+ help(fastQpick)
115
+ ```
116
+
117
+ ### Options
118
+ - input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
119
+ - fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
120
+ - seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
121
+ - output_dir (str) Output directory. Default: ./fastQpick_output
122
+ - gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
123
+ - group_size (int) The size of grouped files. Provide each pair of files sequentially, separated by a space. E.g., I1, R1, R2 - would have group_size=3. Default: 1 (unpaired)
124
+ - replacement (bool) Sample with replacement. Default: False (without replacement).
125
+ - overwrite (bool) Overwrite existing output files. Default: False
126
+ - low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data-structure generation preprocessing). Default: False
127
+ - verbose (bool) Whether to print progress information. Default: True
128
+
129
+ ---
130
+
131
+ ## Features
132
+
133
+ - Efficient sampling of large FASTQ files.
134
+ - Works with both single and paired-end sequencing data.
135
+ - Supports sampling with or without replacement.
136
+ - Command-line interface and Python API for seamless integration.
137
+ - Memory efficient - in low-memory mode, only uses as much memory as a list of (small) integers the length of the number of reads in the fastq file for each file.
138
+ - Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
139
+
140
+ ## Low memory mode vs. standard
141
+ Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
142
+ - Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
143
+ - Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
144
+
145
+ ---
146
+
147
+ ## Examples
148
+
149
+ ### 1. Sample 10% of reads with replacement from a FASTQ file:
150
+
151
+ **Command-line**
152
+ ```bash
153
+ fastQpick --fraction 0.1 -r input.fastq
154
+ ```
155
+
156
+ **Python**
157
+ ```python
158
+ from fastQpick import fastQpick
159
+
160
+ fastQpick(
161
+ input_files='input.fastq',
162
+ fraction=0.1,
163
+ replacement=True
164
+ )
165
+ ```
166
+
167
+ ### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
168
+
169
+ **Command-line**
170
+ ```bash
171
+ fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
172
+ ```
173
+
174
+ **Python**
175
+ ```python
176
+ from fastQpick import fastQpick
177
+
178
+ fastQpick(
179
+ input_files=['input1_R1.fastq', 'input1_R2.fastq'],
180
+ fraction=1,
181
+ seed="42,43,44",
182
+ replacement=True,
183
+ group_size=2,
184
+ )
185
+ ```
186
+ ---
187
+
188
+ ## License
189
+
190
+ fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ---
193
+
194
+ ## Contributing
195
+
196
+ We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
197
+
@@ -0,0 +1,147 @@
1
+ # fastQpick
2
+
3
+ Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
4
+
5
+ ---
6
+
7
+ ## Installation
8
+
9
+ ### Install via PyPI
10
+ ```bash
11
+ pip install fastQpick
12
+ ```
13
+
14
+ ### Install from Source Code
15
+
16
+ Using pip:
17
+ ```bash
18
+ pip install git+https://github.com/pachterlab/fastQpick.git
19
+ ```
20
+
21
+ Or clone the repository and build manually:
22
+ ```bash
23
+ git clone https://github.com/pachterlab/fastQpick.git
24
+ cd fastQpick
25
+ python -m build
26
+ python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Usage
32
+
33
+ ### Command-line Interface
34
+
35
+ Run `fastQpick` with a specified fraction and options:
36
+ ```bash
37
+ fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
38
+ ```
39
+
40
+ ### Python API
41
+
42
+ Use `fastQpick` in your Python code:
43
+ ```python
44
+ from fastQpick import fastQpick
45
+
46
+ fastQpick(
47
+ input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
48
+ fraction=FRACTION,
49
+ ...
50
+ )
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Documentation
56
+
57
+ - **Command-line Help**: Use the following command to see all available options:
58
+ ```bash
59
+ fastQpick --help
60
+ ```
61
+
62
+ - **Python API Help**: Use the `help` function to explore the API:
63
+ ```python
64
+ help(fastQpick)
65
+ ```
66
+
67
+ ### Options
68
+ - input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
69
+ - fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
70
+ - seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
71
+ - output_dir (str) Output directory. Default: ./fastQpick_output
72
+ - gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
73
+ - group_size (int) The size of grouped files. Provide each pair of files sequentially, separated by a space. E.g., I1, R1, R2 - would have group_size=3. Default: 1 (unpaired)
74
+ - replacement (bool) Sample with replacement. Default: False (without replacement).
75
+ - overwrite (bool) Overwrite existing output files. Default: False
76
+ - low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data-structure generation preprocessing). Default: False
77
+ - verbose (bool) Whether to print progress information. Default: True
78
+
79
+ ---
80
+
81
+ ## Features
82
+
83
+ - Efficient sampling of large FASTQ files.
84
+ - Works with both single and paired-end sequencing data.
85
+ - Supports sampling with or without replacement.
86
+ - Command-line interface and Python API for seamless integration.
87
+ - Memory efficient - in low-memory mode, only uses as much memory as a list of (small) integers the length of the number of reads in the fastq file for each file.
88
+ - Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
89
+
90
+ ## Low memory mode vs. standard
91
+ Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
92
+ - Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
93
+ - Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
94
+
95
+ ---
96
+
97
+ ## Examples
98
+
99
+ ### 1. Sample 10% of reads with replacement from a FASTQ file:
100
+
101
+ **Command-line**
102
+ ```bash
103
+ fastQpick --fraction 0.1 -r input.fastq
104
+ ```
105
+
106
+ **Python**
107
+ ```python
108
+ from fastQpick import fastQpick
109
+
110
+ fastQpick(
111
+ input_files='input.fastq',
112
+ fraction=0.1,
113
+ replacement=True
114
+ )
115
+ ```
116
+
117
+ ### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
118
+
119
+ **Command-line**
120
+ ```bash
121
+ fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
122
+ ```
123
+
124
+ **Python**
125
+ ```python
126
+ from fastQpick import fastQpick
127
+
128
+ fastQpick(
129
+ input_files=['input1_R1.fastq', 'input1_R2.fastq'],
130
+ fraction=1,
131
+ seed="42,43,44",
132
+ replacement=True,
133
+ group_size=2,
134
+ )
135
+ ```
136
+ ---
137
+
138
+ ## License
139
+
140
+ fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
141
+
142
+ ---
143
+
144
+ ## Contributing
145
+
146
+ We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
147
+
@@ -0,0 +1,5 @@
1
+ from .main import fastQpick
2
+
3
+ from ._version import __version__
4
+ __author__ = "Joseph Rich"
5
+ __email__ = "josephrich98@gmail.com"
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,241 @@
1
+ import argparse
2
+ import gzip
3
+ import os
4
+ import random
5
+ from tqdm import tqdm
6
+ import pyfastx # to loop through fastq (faster than custom python code)
7
+
8
+ from fastQpick._version import __version__
9
+ from fastQpick.utils import save_params_to_config_file, is_directory_effectively_empty, group_items, count_reads
10
+
11
+ # Global variables
12
+ valid_fastq_extensions = (".fastq", ".fq", ".fastq.gz", ".fq.gz")
13
+ batch_size = 200000 # for buffer
14
+ fastq_to_length_dict = {} # set to empty, and the user can provide otherwise it will be calculated
15
+
16
def write_fastq(input_fastq, output_path, occurrence_list, total_reads, gzip_output, seed = None, verbose = True):
    """
    Write the sampled reads of one FASTQ file to ``output_path``.

    The input is streamed once with ``pyfastx.Fastx``; the record at position
    ``i`` is written ``occurrence_list[i]`` times (zero times means it is
    dropped). Records are accumulated in a buffer and written in batches.

    Parameters
    ----------
    input_fastq       Path of the FASTQ file to read.
    output_path       Path of the file to write.
    occurrence_list   Indexable of ints; entry ``i`` is the number of copies of read ``i`` to emit.
    total_reads       Only used as the ``total`` of the progress bar.
    gzip_output       If truthy, write through ``gzip.open`` in text mode; otherwise use ``open``.
    seed              Only used in the progress-bar description.
    verbose           If truthy, wrap the iteration in a ``tqdm`` progress bar.

    Raises
    ------
    IndexError if the input yields more records than ``occurrence_list`` has entries.
    """
    if gzip_output:
        open_func, write_mode = gzip.open, "wt"
    else:
        open_func, write_mode = open, "w"

    buffer = []  # Temporary storage for the batch

    input_fastq_read_only = pyfastx.Fastx(input_fastq)

    # use tqdm if verbose else silently loop
    iterator = (
        tqdm(input_fastq_read_only, desc=f"Iterating through seed {seed}, file {input_fastq}", unit="read", total=total_reads)
        if verbose else input_fastq_read_only
    )

    with open_func(output_path, write_mode) as f:
        for i, (name, seq, qual) in enumerate(iterator):
            occurrences = occurrence_list[i]

            # Skip unsampled reads before building the record string: when
            # fraction < 1 most reads fall in this branch.
            if not occurrences:
                continue

            buffer.extend([f"@{name}\n{seq}\n+\n{qual}\n"] * occurrences)

            # Flush on the buffer's own size (not on the read index) so the
            # memory held is bounded even when reads are duplicated many times.
            if len(buffer) >= batch_size:
                f.writelines(buffer)
                buffer.clear()

        # Write any remaining entries in the buffer
        if buffer:
            f.writelines(buffer)
            buffer.clear()
48
+
49
def make_occurrence_list(file, seed, total_reads, number_of_reads_to_sample, replacement, low_memory, verbose):
    """
    Draw read indices at random and return how often each index was drawn.

    Uses the module-level ``random`` state, so the caller is responsible for
    seeding it. ``file`` and ``seed`` are only used in progress messages.

    Parameters
    ----------
    file                        Label shown in progress output.
    seed                        Label shown in progress output.
    total_reads                 Size of the index population (indices 0 .. total_reads - 1).
    number_of_reads_to_sample   How many indices to draw.
    replacement                 Draw with replacement if truthy, otherwise without.
    low_memory                  If truthy, feed the draws through a generator instead of storing them in a tuple.
    verbose                     If truthy, print a message and show a ``tqdm`` progress bar.

    Returns
    -------
    list of int of length ``total_reads``; entry ``i`` is the number of times index ``i`` was drawn.
    """
    if verbose:
        print(f"Calculating total reads and determining random indices for seed {seed}, file {file}")

    population = range(total_reads)

    if replacement and low_memory:
        # drawn lazily, one index at a time, while counting below
        drawn = (random.choice(population) for _ in range(number_of_reads_to_sample))
    elif replacement:
        drawn = tuple(random.choices(population, k=number_of_reads_to_sample))
    elif low_memory:
        drawn = (index for index in random.sample(population, k=number_of_reads_to_sample))
    else:
        drawn = tuple(random.sample(population, k=number_of_reads_to_sample))

    # A plain list indexed by read position is used rather than a Counter,
    # because a Counter is dictionary-backed and costs more memory.
    counts = [0] * total_reads

    if verbose:
        draws = tqdm(drawn, desc=f"Counting occurrences for seed {seed}, file {file}", unit="read", total=number_of_reads_to_sample)
    else:
        draws = drawn

    for position in draws:
        counts[position] += 1

    return counts
79
+
80
def bootstrap_single_file(files_total = None, gzip_output = None, output_directory = None, seed = None, fraction = None, replacement = None, low_memory = False, verbose=True):
    """
    Sample one file, or one group of files, with a single shared set of read indices.

    The read count is looked up in ``fastq_to_length_dict`` for the first file
    of the group, one occurrence list is drawn from it, and that same list is
    applied to every file of the group so the same read positions are kept in
    each file.

    Parameters
    ----------
    files_total        A FASTQ path (str) or a sequence of FASTQ paths forming one group.
    gzip_output        If truthy, outputs get a ``.gz`` suffix and are gzip-compressed;
                       otherwise a trailing ``.gz`` is removed from the output name.
    output_directory   Directory for the outputs. ``None`` or ``""`` means the current
                       working directory.
    seed               Passed through for progress messages only.
    fraction           Number of reads sampled is ``int(fraction * total_reads)``.
    replacement        Sample with replacement if truthy.
    low_memory         Passed through to ``make_occurrence_list``.
    verbose            Passed through to the helpers.

    Raises
    ------
    KeyError if the first file is missing from ``fastq_to_length_dict``.
    ValueError if an output path would be the same file as its input.
    """
    if isinstance(files_total, str):
        files_total = (files_total, )

    # The parameter defaults to None, which os.path.join rejects; treat it
    # like "" (outputs land in the current working directory).
    output_directory = output_directory or ""

    total_reads = fastq_to_length_dict[files_total[0]]
    number_of_reads_to_sample = int(fraction * total_reads)

    occurrence_list = make_occurrence_list(file=files_total[0], seed=seed, total_reads=total_reads, number_of_reads_to_sample=number_of_reads_to_sample, replacement=replacement, low_memory=low_memory, verbose=verbose)

    # Create output directory once if it doesn't exist
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for file in files_total:
        output_path = os.path.join(output_directory, os.path.basename(file))

        if gzip_output and not output_path.endswith(".gz"):
            output_path += ".gz"
        elif not gzip_output and output_path.endswith(".gz"):
            output_path = output_path[:-3]

        # Opening the output for writing truncates it; if it is the input
        # itself the reads would be destroyed while they are being read.
        if os.path.abspath(output_path) == os.path.abspath(file):
            raise ValueError(f"Output path '{output_path}' is the same as the input file. Please specify a different output directory.")

        # write fastq
        write_fastq(input_fastq = file, output_path = output_path, occurrence_list = occurrence_list, total_reads = total_reads, gzip_output = gzip_output, seed = seed, verbose = verbose)
102
+
103
def sample_multiple_files(file_list, fraction, seed_list, output, gzip_output, replacement, low_memory, verbose):
    """
    Run the sampling for every seed and every file (or file group).

    The module-level ``random`` generator is re-seeded once per seed, then each
    entry of ``file_list`` is handed to ``bootstrap_single_file``.

    Output layout
    -------------
    With a single seed, files are written directly into ``output``.
    With several seeds, each seed writes into its own ``output/seed_<seed>``
    sub-directory. Output names are derived from the input basenames only, so
    writing every seed into ``output`` itself would make each seed overwrite
    the previous one and leave only the last seed's files.

    Parameters
    ----------
    file_list     Iterable of FASTQ paths or tuples of FASTQ paths (groups).
    fraction      Fraction of reads to sample.
    seed_list     Iterable of seeds.
    output        Output directory.
    gzip_output, replacement, low_memory, verbose
                  Passed through to ``bootstrap_single_file``.
    """
    seeds = list(seed_list)
    separate_seed_dirs = len(seeds) > 1

    for seed in seeds:
        random.seed(seed)
        seed_output = os.path.join(output or "", f"seed_{seed}") if separate_seed_dirs else output
        for file in file_list:
            bootstrap_single_file(files_total = file, gzip_output = gzip_output, output_directory = seed_output, seed = seed, fraction = fraction, replacement = replacement, low_memory = low_memory, verbose = verbose)
108
+
109
def make_fastq_to_length_dict(file_list, verbose=True):
    """
    Fill the module-level ``fastq_to_length_dict`` with read counts.

    Each entry of ``file_list`` is either a path (str) or a tuple of paths
    forming a group. For a group only the first file is counted and that count
    is stored for every member. Entries whose paths are all already present in
    the dictionary are skipped; entries of any other type are ignored.

    Parameters
    ----------
    file_list   Iterable of str or tuple of str.
    verbose     If truthy, print which file is being counted and the final dictionary.
    """
    global fastq_to_length_dict

    for entry in file_list:
        if isinstance(entry, tuple):
            members = entry
        elif isinstance(entry, str):
            members = (entry,)
        else:
            continue

        # nothing to do when every member already has a known count
        if all(member in fastq_to_length_dict for member in members):
            continue

        representative = members[0]
        if verbose:
            print(f"Counting {representative}")
        count = count_reads(representative)

        for member in members:
            fastq_to_length_dict[member] = count

    if verbose:
        print("fastq_to_length_dict:", fastq_to_length_dict)
129
+
130
def _expand_fastq_directory(directory):
    """Return the FASTQ files directly inside ``directory`` as joined paths, sorted by name."""
    found = []
    # sorted() makes the order deterministic, so that e.g. *_R1 / *_R2 files
    # end up adjacent and can be grouped by position.
    for entry in sorted(os.listdir(directory)):
        candidate = os.path.join(directory, entry)
        if os.path.isfile(candidate) and entry.endswith(tuple(valid_fastq_extensions)):
            found.append(candidate)
    if not found:
        raise ValueError(f"No valid FASTQ files found in directory '{directory}'.")
    return found


def _collect_fastq_paths(input_files):
    """Turn the user-supplied ``input_files`` into a flat list of FASTQ file paths."""
    if isinstance(input_files, str):
        if os.path.isdir(input_files):
            return _expand_fastq_directory(input_files)
        # A single non-directory string is passed through unchecked, as before.
        return [input_files]

    if not isinstance(input_files, (tuple, list)):
        raise ValueError("Input file list must be a string, tuple of strings, or list of strings.")

    collected = []
    for path in input_files:
        if not isinstance(path, str):
            raise ValueError("Input file list must be a string, tuple of strings, or list of strings.")
        if not os.path.exists(path):
            raise FileNotFoundError(f"File or directory '{path}' not found.")
        if os.path.isdir(path):
            collected.extend(_expand_fastq_directory(path))
        elif os.path.isfile(path):
            if not path.endswith(tuple(valid_fastq_extensions)):
                raise ValueError(f"File '{path}' is not a valid FASTQ file.")
            collected.append(path)
    return collected


def fastQpick(input_files, fraction, seed=42, output_dir="fastQpick_output", gzip_output=False, group_size=1, replacement=False, overwrite=False, low_memory=False, verbose=True, **kwargs):
    """
    Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.

    Parameters
    ----------
    input_files (str, list, or tuple)  List of input FASTQ files or directories containing FASTQ files. Directories are expanded to the FASTQ files directly inside them, sorted by name.
    fraction (int or float)            The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
    seed (int or str)                  Random seed(s). Can provide multiple seeds separated by commas. Default: 42
    output_dir (str)                   Output directory. Default: ./fastQpick_output
    gzip_output (bool)                 Whether or not to gzip the output. Default: False (uncompressed)
    group_size (int)                   The size of grouped files. Provide each pair of files sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
    replacement (bool)                 Sample with replacement. Default: False (without replacement).
    overwrite (bool)                   Overwrite existing output files. Default: False
    low_memory (bool)                  Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False
    verbose (bool)                     Whether to print progress information. Default: True

    kwargs
    ------
    fastq_to_length_dict (dict)        Dictionary of FASTQ file paths to number of reads in each file. If not provided, will be calculated.

    Raises
    ------
    FileExistsError    if ``output_dir`` exists, is not effectively empty, and ``overwrite`` is False.
    FileNotFoundError  if an entry of a list/tuple ``input_files`` does not exist.
    ValueError         if ``input_files`` has an unsupported type, a listed file does not have a
                       FASTQ extension, or a directory contains no FASTQ files.
    """
    global fastq_to_length_dict

    # A caller-supplied read-count dictionary replaces the module-level one
    if "fastq_to_length_dict" in kwargs and isinstance(kwargs["fastq_to_length_dict"], dict):
        fastq_to_length_dict = kwargs["fastq_to_length_dict"]

    # Check overwrite
    if not overwrite:
        if os.path.exists(output_dir) and not is_directory_effectively_empty(output_dir):  # check if dir exists and is not empty
            raise FileExistsError(f"Output directory '{output_dir}' already exists. Please specify a different output directory or set the overwrite flag to True.")

    # Save arguments to a config file.
    # NOTE: save_params_to_config_file reads this function's arguments by
    # inspecting the calling frame, so it must be called directly from here
    # and before any parameter is reassigned below.
    os.makedirs(output_dir, exist_ok=True)
    config_file = os.path.join(output_dir, "fastQpick_config.json")
    save_params_to_config_file(config_file)

    # make sure fraction is a float (not a string); fraction >= 1 implies replacement
    fraction = float(fraction)
    if fraction >= 1.0:
        replacement = True

    # only keep valid fastq files, or the fastq files directly inside a given folder
    input_files_parsed = _collect_fastq_paths(input_files)

    if isinstance(seed, int):  # if a single int is passed as a seed
        seed = [seed]
    elif isinstance(seed, str):  # if a string of comma-separated ints is passed as a seed (like on the command line)
        seed = [int(specific_seed) for specific_seed in seed.split(",")]

    group_size = int(group_size)  # make sure group_size is an int (not a string)

    if group_size > 1:
        input_files_parsed = group_items(input_files_parsed, group_size=group_size)

    # Count reads in each file and store in a dictionary
    make_fastq_to_length_dict(input_files_parsed, verbose=verbose)

    # Do the sampling
    sample_multiple_files(file_list=input_files_parsed, fraction=fraction, seed_list=seed, output=output_dir, gzip_output=gzip_output, replacement=replacement, low_memory=low_memory, verbose=verbose)
211
+
212
def main(argv=None):
    """
    Command-line entry point: parse the arguments and call ``fastQpick``.

    Parameters
    ----------
    argv (list of str or None)  Arguments to parse. ``None`` (the default) makes argparse read ``sys.argv``.
    """
    def _parse_bool(value):
        # argparse hands over the raw string, and any non-empty string is
        # truthy, so "-z False" used to switch gzip ON. Map the usual "false"
        # spellings to False; every other non-empty value stays True as before.
        return str(value).strip().lower() not in {"", "false", "f", "no", "n", "0"}

    # Create argument parser
    parser = argparse.ArgumentParser(description="Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.")
    parser.add_argument("-f", "--fraction", required=True, help="The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.")
    parser.add_argument("-s", "--seed", required=False, default=42, help="Random seed(s). Can provide multiple seeds separated by commas. Default: 42")
    parser.add_argument("-o", "--output_dir", required=False, type=str, default="fastQpick_output", help="Output directory. Default: ./fastQpick_output")
    parser.add_argument("-z", "--gzip_output", required=False, type=_parse_bool, default=False, help="Whether or not to gzip the output (True/False). Default: False (uncompressed)")
    parser.add_argument("-g", "--group_size", required=False, default=1, help="The size of grouped files. Provide each pair of files sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)")
    parser.add_argument("-r", "--replacement", action="store_true", help="Sample with replacement. Default: False (without replacement).")
    parser.add_argument("-w", "--overwrite", action="store_true", help="Overwrite existing output files. Default: False")
    parser.add_argument("-l", "--low_memory", action="store_true", help="Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False")
    # store_false: the stored value is True unless -q is given, i.e. it is the
    # verbose flag, so it is stored under that name.
    parser.add_argument("-q", "--quiet", dest="verbose", action="store_false", help="Turn off verbose output. Default: False")
    parser.add_argument("-v", "--version", action="version", version=f"fastQpick {__version__}", help="Show program's version number and exit")

    # Positional argument for input files (indefinite number)
    parser.add_argument("input_files", nargs="+", help="Input FASTQ file(s) (one after the other, space-separated) or FASTQ folder(s)")

    # Parse arguments
    args = parser.parse_args(argv)

    fastQpick(input_files=args.input_files,
              fraction=args.fraction,
              seed=args.seed,
              output_dir=args.output_dir,
              gzip_output=args.gzip_output,
              group_size=args.group_size,
              replacement=args.replacement,
              overwrite=args.overwrite,
              low_memory=args.low_memory,
              verbose=args.verbose)
@@ -0,0 +1,102 @@
1
+ import gzip
2
+ import inspect
3
+ import json
4
+ import os
5
+ from collections import OrderedDict
6
+ import pyfastx
7
+
8
def count_reads(filepath):
    """Return the number of records obtained by iterating ``pyfastx.Fastx(filepath)``."""
    num_reads = 0
    for _ in pyfastx.Fastx(filepath):
        num_reads += 1
    return num_reads
12
+
13
def read_fastq(fastq_file, include_plus_line=False):
    """
    Generator over the records of a FASTQ file, read four lines at a time.

    Files whose name ends in ``.gz`` are opened with ``gzip.open`` in text
    mode, everything else with ``open``. Every line is stripped of surrounding
    whitespace. Iteration stops at the first record whose header line is empty
    (which includes end of file).

    Parameters
    ----------
    fastq_file (str)          Path of the FASTQ file.
    include_plus_line (bool)  If True yield ``(header, sequence, plus_line, quality)``,
                              otherwise yield ``(header, sequence, quality)``.

    Raises
    ------
    RuntimeError wrapping any ``Exception`` raised while opening or reading the file.
    """
    if fastq_file.endswith(".gz"):
        opener, mode = gzip.open, "rt"
    else:
        opener, mode = open, "r"

    try:
        with opener(fastq_file, mode) as handle:
            # The flag is tested once, outside the loops, so that it is not
            # re-evaluated for every record.
            if include_plus_line:
                while True:
                    record = (
                        handle.readline().strip(),
                        handle.readline().strip(),
                        handle.readline().strip(),
                        handle.readline().strip(),
                    )
                    if not record[0]:
                        break
                    yield record
            else:
                while True:
                    header = handle.readline().strip()
                    sequence = handle.readline().strip()
                    handle.readline().strip()  # '+' separator line, not reported
                    quality = handle.readline().strip()
                    if not header:
                        break
                    yield header, sequence, quality
    except Exception as e:
        raise RuntimeError(f"Error reading FASTQ file '{fastq_file}': {e}")
45
+
46
+ def make_function_parameter_to_value_dict(levels_up = 1):
47
+ # Collect parameters in a dictionary
48
+ params = OrderedDict()
49
+
50
+ # Get the caller's frame (one level up in the stack)
51
+ frame = inspect.currentframe()
52
+
53
+ for _ in range(levels_up):
54
+ if frame is None:
55
+ break
56
+ frame = frame.f_back
57
+
58
+ function_args, varargs, varkw, values = inspect.getargvalues(frame)
59
+
60
+ # handle explicit function arguments
61
+ for arg in function_args:
62
+ params[arg] = values[arg]
63
+
64
+ # handle *args
65
+ if varargs:
66
+ params["*args"] = values[varargs]
67
+
68
+ # handle **kwargs
69
+ if varkw:
70
+ for key, value in values[varkw].items():
71
+ params[key] = value
72
+
73
+ return params
74
+
75
+
76
def save_params_to_config_file(out_file="run_config.json"):
    """
    Write the arguments of the calling function to ``out_file`` as JSON.

    The parent directory of ``out_file`` is created when the path has one.
    The arguments are obtained from the caller's stack frame, so this must be
    called directly from the function whose arguments should be recorded.

    Parameters
    ----------
    out_file (str)  Path of the JSON file to write. Default: run_config.json
    """
    parent_directory = os.path.dirname(out_file)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # levels_up = 2: one step reaches this function's frame, the second
    # reaches the function that called it.
    caller_params = make_function_parameter_to_value_dict(levels_up = 2)

    # Write to JSON
    with open(out_file, "w") as handle:
        json.dump(caller_params, handle, indent=4)
89
+
90
+
91
def is_directory_effectively_empty(directory_path):
    """
    Return True if ``directory_path`` contains no visible entries.

    Entries named ``.DS_Store`` and entries whose name starts with a dot are
    ignored; any other entry makes the directory count as non-empty.
    """
    for entry in os.listdir(directory_path):
        if entry in {".DS_Store"} or entry.startswith("."):
            continue
        # first visible entry settles it
        return False
    return True
98
+
99
def group_items(file_list, group_size=2):
    """
    Split ``file_list`` into consecutive tuples of ``group_size`` items.

    Returns a list of tuples in the original order.

    Raises
    ------
    ValueError if the length of ``file_list`` is not a multiple of ``group_size``.
    """
    if len(file_list) % group_size:
        raise ValueError(f"The list length must be divisible by {group_size} to form groups.")

    groups = []
    for start in range(0, len(file_list), group_size):
        groups.append(tuple(file_list[start:start + group_size]))
    return groups
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.2
2
+ Name: fastQpick
3
+ Version: 0.1.0
4
+ Summary: Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.
5
+ Author-email: Joseph Rich <josephrich98@gmail.com>
6
+ Maintainer-email: Joseph Rich <josephrich98@gmail.com>
7
+ License: BSD 2-Clause License
8
+
9
+ Copyright (c) 2024, Pachter Lab
10
+
11
+ Redistribution and use in source and binary forms, with or without
12
+ modification, are permitted provided that the following conditions are met:
13
+
14
+ 1. Redistributions of source code must retain the above copyright notice, this
15
+ list of conditions and the following disclaimer.
16
+
17
+ 2. Redistributions in binary form must reproduce the above copyright notice,
18
+ this list of conditions and the following disclaimer in the documentation
19
+ and/or other materials provided with the distribution.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+
32
+ Project-URL: Homepage, https://github.com/pachterlab/fastQpick
33
+ Keywords: fastQpick,bioinformatics,statistics,RNA-seq,DNA-seq
34
+ Classifier: Environment :: Console
35
+ Classifier: Framework :: Jupyter
36
+ Classifier: Intended Audience :: Science/Research
37
+ Classifier: License :: OSI Approved :: BSD License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Programming Language :: Python :: 3.9
40
+ Classifier: Programming Language :: Python :: 3.10
41
+ Classifier: Programming Language :: Python :: 3.11
42
+ Classifier: Programming Language :: Python :: 3.12
43
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
44
+ Classifier: Topic :: Utilities
45
+ Requires-Python: >=3.7
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: pyfastx>=2.0.0
49
+ Requires-Dist: tqdm>=4.66.0
50
+
51
+ # fastQpick
52
+
53
+ Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
54
+
55
+ ---
56
+
57
+ ## Installation
58
+
59
+ ### Install via PyPI
60
+ ```bash
61
+ pip install fastQpick
62
+ ```
63
+
64
+ ### Install from Source Code
65
+
66
+ Using pip:
67
+ ```bash
68
+ pip install git+https://github.com/pachterlab/fastQpick.git
69
+ ```
70
+
71
+ Or clone the repository and build manually:
72
+ ```bash
73
+ git clone https://github.com/pachterlab/fastQpick.git
74
+ cd fastQpick
75
+ python -m build
76
+ python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Usage
82
+
83
+ ### Command-line Interface
84
+
85
+ Run `fastQpick` with a specified fraction and options:
86
+ ```bash
87
+ fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
88
+ ```
89
+
90
+ ### Python API
91
+
92
+ Use `fastQpick` in your Python code:
93
+ ```python
94
+ from fastQpick import fastQpick
95
+
96
+ fastQpick(
97
+ input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
98
+ fraction=FRACTION,
99
+ ...
100
+ )
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Documentation
106
+
107
+ - **Command-line Help**: Use the following command to see all available options:
108
+ ```bash
109
+ fastQpick --help
110
+ ```
111
+
112
+ - **Python API Help**: Use the `help` function to explore the API:
113
+ ```python
114
+ help(fastQpick)
115
+ ```
116
+
117
+ ### Options
118
+ - input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
119
+ - fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
120
+ - seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
121
+ - output_dir (str) Output directory. Default: ./fastQpick_output
122
+ - gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
123
+ - group_size (int) The number of files in each group. Provide the files of each group sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
124
+ - replacement (bool) Sample with replacement. Default: False (without replacement).
125
+ - overwrite (bool) Overwrite existing output files. Default: False
126
+ - low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data-structure generation preprocessing). Default: False
127
+ - verbose (bool) Whether to print progress information. Default: True
128
+
129
+ ---
130
+
131
+ ## Features
132
+
133
+ - Efficient sampling of large FASTQ files.
134
+ - Works with both single and paired-end sequencing data.
135
+ - Supports sampling with or without replacement.
136
+ - Command-line interface and Python API for seamless integration.
137
+ - Memory efficient - in low-memory mode, each file only needs about as much memory as a list of (small) integers with one entry per read in that FASTQ file.
138
+ - Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
139
+
140
+ ## Low memory mode vs. standard
141
+ Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
142
+ - Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
143
+ - Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
144
+
145
+ ---
146
+
147
+ ## Examples
148
+
149
+ ### 1. Sample 10% of reads with replacement from a FASTQ file:
150
+
151
+ **Command-line**
152
+ ```bash
153
+ fastQpick --fraction 0.1 -r input.fastq
154
+ ```
155
+
156
+ **Python**
157
+ ```python
158
+ from fastQpick import fastQpick
159
+
160
+ fastQpick(
161
+ input_files='input.fastq',
162
+ fraction=0.1,
163
+ replacement=True
164
+ )
165
+ ```
166
+
167
+ ### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
168
+
169
+ **Command-line**
170
+ ```bash
171
+ fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
172
+ ```
173
+
174
+ **Python**
175
+ ```python
176
+ from fastQpick import fastQpick
177
+
178
+ fastQpick(
179
+ input_files=['input1_R1.fastq', 'input1_R2.fastq'],
180
+ fraction=1,
181
+ seed="42,43,44",
182
+ replacement=True,
183
+ group_size=2,
184
+ )
185
+ ```
186
+ ---
187
+
188
+ ## License
189
+
190
+ fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
191
+
192
+ ---
193
+
194
+ ## Contributing
195
+
196
+ We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
197
+
@@ -0,0 +1,14 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ fastQpick/__init__.py
5
+ fastQpick/_version.py
6
+ fastQpick/main.py
7
+ fastQpick/utils.py
8
+ fastQpick.egg-info/PKG-INFO
9
+ fastQpick.egg-info/SOURCES.txt
10
+ fastQpick.egg-info/dependency_links.txt
11
+ fastQpick.egg-info/entry_points.txt
12
+ fastQpick.egg-info/requires.txt
13
+ fastQpick.egg-info/top_level.txt
14
+ tests/test_fastQpick.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fastQpick = fastQpick.main:main
@@ -0,0 +1,2 @@
1
+ pyfastx>=2.0.0
2
+ tqdm>=4.66.0
@@ -0,0 +1 @@
1
+ fastQpick
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "fastQpick"
7
+ version = "0.1.0"
8
+ description = "Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement."
9
+ readme = {file = "README.md", content-type = "text/markdown"}
10
+ license = {file = "LICENSE"}
11
+ authors = [
12
+ {name = "Joseph Rich", email = "josephrich98@gmail.com"}
13
+ ]
14
+ maintainers = [
15
+ {name = "Joseph Rich", email = "josephrich98@gmail.com"}
16
+ ]
17
+ requires-python = ">=3.7"
18
+ keywords = ["fastQpick", "bioinformatics", "statistics", "RNA-seq", "DNA-seq"]
19
+ classifiers = [
20
+ "Environment :: Console",
21
+ "Framework :: Jupyter",
22
+ "Intended Audience :: Science/Research",
23
+ "License :: OSI Approved :: BSD License",
24
+ "Operating System :: OS Independent",
25
+ "Programming Language :: Python :: 3.9",
26
+ "Programming Language :: Python :: 3.10",
27
+ "Programming Language :: Python :: 3.11",
28
+ "Programming Language :: Python :: 3.12",
29
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
30
+ "Topic :: Utilities"
31
+ ]
32
+ dependencies = [
33
+ "pyfastx>=2.0.0",
34
+ "tqdm>=4.66.0",
35
+ ]
36
+
37
+ [project.urls]
38
+ "Homepage" = "https://github.com/pachterlab/fastQpick"
39
+
40
+ [tool.setuptools]
41
+ packages = ["fastQpick"]
42
+
43
+ [tool.setuptools.package-data]
44
+ fastQpick = ["*.txt", "*.md", "*.csv"] # Ensure additional files like data or docs are included
45
+
46
+ [project.scripts]
47
+ fastQpick = "fastQpick.main:main" # This replaces the console_scripts entry
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,320 @@
1
+ import os
2
+ import tempfile
3
+ import pytest
4
+ from fastQpick import fastQpick
5
+ from fastQpick.utils import read_fastq, count_reads
6
+ from pdb import set_trace as st
7
+
8
@pytest.fixture
def temp_fastq_file():
    """Yield the path of a temporary five-read FASTQ file, then delete it."""
    fastq_text = (
        "@Header1\n"
        "AAAAAAAAAAAAAAAAAAAAA\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header2\n"
        "CCCCCCCCCCCCCCCCCCCCC\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header3\n"
        "GGGGGGGGGGGGGGGGGGGGG\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header4\n"
        "TTTTTTTTTTTTTTTTTTTTT\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header5\n"
        "AAAAAAAAAAAAACCCCCCCC\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
    )

    # delete=False keeps the file on disk until the explicit removal below.
    fastq_handle = tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".fastq")
    with fastq_handle:
        fastq_handle.write(fastq_text)
        fastq_handle.seek(0)  # Rewind to the start of the file
        yield fastq_handle.name  # The test receives the file path

    # Teardown: remove the file once the test has finished.
    os.remove(fastq_handle.name)
39
+
40
+ # Fixture to create two temporary FASTQ files
41
@pytest.fixture
def temp_paired_fastq_files():
    """Yield the paths of two temporary four-read FASTQ files, then delete them.

    Headers in the two files share a prefix and differ only in the suffix
    after the last underscore ("_1" vs "_2").
    """
    first_mate_text = (
        "@Header1_1\n"
        "AAAAAAAAAAAAAAAAAAAAA\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header2_1\n"
        "CCCCCCCCCCCCCCCCCCCCC\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header3_1\n"
        "GGGGGGGGGGGGGGGGGGGGG\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header4_1\n"
        "TTTTTTTTTTTTTTTTTTTTT\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
    )

    second_mate_text = (
        "@Header1_2\n"
        "AAAAAAAACCCCCCCCCCCCC\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header2_2\n"
        "AAAAAAAGGGGGGGGGGGGGG\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header3_2\n"
        "AAAAAAATTTTTTTTTTTTTT\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
        "@Header4_2\n"
        "CCCCCCCCAAAAAAAAAAAAA\n"
        "+\n"
        "IIIIIIIIIIIIIIIIIIIII\n"
    )

    # delete=False keeps both files on disk until the explicit removal below.
    first_handle = tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".fastq")
    second_handle = tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".fastq")
    with first_handle, second_handle:
        first_handle.write(first_mate_text)
        second_handle.write(second_mate_text)
        first_handle.seek(0)
        second_handle.seek(0)
        yield [first_handle.name, second_handle.name]  # Paths of both files

    # Teardown: remove both files once the test has finished.
    os.remove(first_handle.name)
    os.remove(second_handle.name)
91
+
92
+
93
+
94
+
95
def is_gzipped(file_path):
    """Return True when the file starts with the two-byte gzip magic number."""
    gzip_magic = b"\x1f\x8b"
    with open(file_path, "rb") as binary_handle:
        leading_bytes = binary_handle.read(len(gzip_magic))
    return leading_bytes == gzip_magic
99
+
100
def validate_fastq_format(file_path, ground_truth=None):
    """Assert that every record in a FASTQ file is well formed.

    Args:
        file_path: Path of the FASTQ file to check.
        ground_truth: Optional mapping from header to a dict with the keys
            "sequence", "plus_line" and "quality". When given and non-empty,
            every record must also match the entry stored under its header.

    Raises:
        AssertionError: If a record is malformed or disagrees with
            ``ground_truth``.
    """
    for header, seq, plus_line, qual in read_fastq(file_path, include_plus_line=True):
        assert header.startswith("@"), f"Header does not start with '@': {header}"
        assert len(seq) == len(qual), f"Sequence and quality lengths do not match: {seq} {qual}"
        assert plus_line.startswith("+"), f"Plus line does not start with '+': {plus_line}"

        if ground_truth:
            assert header in ground_truth, f"Header not found in ground truth: {header}"
            # The ground truth is the expected value; the record read from
            # file_path is what was actually observed.
            expected = ground_truth[header]
            assert seq == expected["sequence"], f"Sequence mismatch - expected: {expected['sequence']}; got: {seq}"
            assert plus_line == expected["plus_line"], f"Plus line mismatch - expected: {expected['plus_line']}; got: {plus_line}"
            assert qual == expected["quality"], f"Quality mismatch - expected: {expected['quality']}; got: {qual}"
111
+
112
def count_number_of_unique_headers(file_path):
    """Return how many distinct header lines the FASTQ file contains."""
    unique_headers = {
        header
        for header, _seq, _plus_line, _qual in read_fastq(file_path, include_plus_line=True)
    }
    return len(unique_headers)
117
+
118
def make_fastq_dict(file_path):
    """Index a FASTQ file by header.

    Returns:
        A dict mapping each header to a dict with the keys "sequence",
        "plus_line" and "quality". A repeated header keeps its last record.
    """
    return {
        header: {"sequence": seq, "plus_line": plus_line, "quality": qual}
        for header, seq, plus_line, qual in read_fastq(file_path, include_plus_line=True)
    }
126
+
127
+
128
def check_pairwise_agreement(temp_paired_fastq_files, temp_output_dir, gzip_output):
    """Assert that the two sampled output files contain matching reads in order.

    Args:
        temp_paired_fastq_files: The two input FASTQ paths; the outputs are
            looked up in ``temp_output_dir`` under the same base names.
        temp_output_dir: Directory holding the sampled output files.
        gzip_output: Whether the outputs carry an extra ".gz" suffix.

    Raises:
        AssertionError: If the outputs hold different numbers of reads, or
            the headers at the same position differ before their last
            underscore.
    """
    from itertools import zip_longest

    output_paths = []
    for input_path in temp_paired_fastq_files[:2]:
        output_path = os.path.join(temp_output_dir, os.path.basename(input_path))
        if gzip_output:
            output_path += ".gz"
        output_paths.append(output_path)
    output_fastq_file1, output_fastq_file2 = output_paths

    # zip_longest pads the shorter file with None, so a length mismatch is
    # reported instead of being silently truncated as plain zip would do.
    for record1, record2 in zip_longest(
        read_fastq(output_fastq_file1, include_plus_line=True),
        read_fastq(output_fastq_file2, include_plus_line=True),
    ):
        assert record1 is not None and record2 is not None, "Paired output files contain different numbers of reads"

        # Compare headers up to the last underscore
        split_header1 = record1[0].rsplit('_', 1)[0]
        split_header2 = record2[0].rsplit('_', 1)[0]

        assert split_header1 == split_header2, f"Headers do not match: {split_header1} != {split_header2}"
149
+
150
def run_all_single_file_tests(temp_output_dir, temp_fastq_file, gzip_output, fraction, replacement):
    """Run the shared output checks for one input FASTQ file.

    Checks that the output exists under the input's base name (plus ".gz"
    when ``gzip_output`` is set), that each record matches the input, that
    compression matches ``gzip_output``, that the read count equals the input
    count times ``fraction``, and that header uniqueness fits ``replacement``.
    """
    assert os.path.exists(temp_output_dir), "Output directory does not exist!"
    assert len(os.listdir(temp_output_dir)) > 0, "No output files were created!"

    output_name = os.path.basename(temp_fastq_file) + (".gz" if gzip_output else "")
    output_fastq_file = os.path.join(temp_output_dir, output_name)

    # Every sampled record must be well formed and identical to its input record.
    validate_fastq_format(output_fastq_file, ground_truth=make_fastq_dict(temp_fastq_file))

    output_is_gzipped = is_gzipped(output_fastq_file)
    assert output_is_gzipped == gzip_output, f"Gzipped output - expected: {gzip_output}; got: {output_is_gzipped}"

    expected_reads = count_reads(temp_fastq_file) * fraction
    num_reads_output = count_reads(output_fastq_file)
    assert num_reads_output == expected_reads, f"Number of reads mismatch - expected: {expected_reads}; got: {num_reads_output}"

    num_unique_reads = count_number_of_unique_headers(output_fastq_file)

    if not replacement:
        # Without replacement no read may appear twice.
        assert num_unique_reads == num_reads_output, f"Number of unique reads mismatch - expected: {num_reads_output}; got: {num_unique_reads}"
    elif fraction > 1:
        # Oversampling with replacement must repeat at least one read.
        assert num_unique_reads < num_reads_output, f"Number of unique reads mismatch - expected: less than {num_reads_output}; got: {num_unique_reads}"
182
+
183
def test_single_file(temp_fastq_file):
    """Sample 60% of a single file without replacement, uncompressed."""
    checked_options = dict(fraction=0.6, gzip_output=False, replacement=False)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **checked_options,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **checked_options,
        )
202
+
203
+ # st()
204
+
205
def test_single_file_bootstrapped(temp_fastq_file):
    """Sample 100% of a single file with replacement, uncompressed."""
    checked_options = dict(fraction=1, gzip_output=False, replacement=True)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **checked_options,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **checked_options,
        )
224
+
225
+ # st()
226
+
227
def test_single_file_oversampled(temp_fastq_file):
    """Sample 300% of a single file with replacement, uncompressed."""
    checked_options = dict(fraction=3, gzip_output=False, replacement=True)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **checked_options,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **checked_options,
        )
246
+
247
+ # st()
248
+
249
def test_single_gzipped(temp_fastq_file):
    """Sample 60% of a single file without replacement, gzipped output."""
    checked_options = dict(fraction=0.6, gzip_output=True, replacement=False)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **checked_options,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **checked_options,
        )
268
+
269
+ # st()
270
+
271
+
272
def test_paired_files(temp_paired_fastq_files):
    """Sample 75% of a file pair without replacement and check the pair stays in sync."""
    checked_options = dict(fraction=0.75, gzip_output=False, replacement=False)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_paired_fastq_files,
            seed=42,
            output_dir=temp_output_dir,
            group_size=2,
            overwrite=True,
            **checked_options,
        )

        # Each mate file must pass the single-file checks on its own.
        for fastq_file in temp_paired_fastq_files:
            run_all_single_file_tests(
                temp_output_dir=temp_output_dir,
                temp_fastq_file=fastq_file,
                **checked_options,
            )

        check_pairwise_agreement(
            temp_paired_fastq_files=temp_paired_fastq_files,
            temp_output_dir=temp_output_dir,
            gzip_output=checked_options["gzip_output"],
        )
294
+
295
+ # st()
296
+
297
def test_paired_files_bootstrapped(temp_paired_fastq_files):
    """Sample 100% of a file pair with replacement and check the pair stays in sync."""
    checked_options = dict(fraction=1, gzip_output=False, replacement=True)

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_paired_fastq_files,
            seed=42,
            output_dir=temp_output_dir,
            group_size=2,
            overwrite=True,
            **checked_options,
        )

        # Each mate file must pass the single-file checks on its own.
        for fastq_file in temp_paired_fastq_files:
            run_all_single_file_tests(
                temp_output_dir=temp_output_dir,
                temp_fastq_file=fastq_file,
                **checked_options,
            )

        check_pairwise_agreement(
            temp_paired_fastq_files=temp_paired_fastq_files,
            temp_output_dir=temp_output_dir,
            gzip_output=checked_options["gzip_output"],
        )
319
+
320
+ # st()