fastQpick 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastqpick-0.1.0/LICENSE +24 -0
- fastqpick-0.1.0/PKG-INFO +197 -0
- fastqpick-0.1.0/README.md +147 -0
- fastqpick-0.1.0/fastQpick/__init__.py +5 -0
- fastqpick-0.1.0/fastQpick/_version.py +1 -0
- fastqpick-0.1.0/fastQpick/main.py +241 -0
- fastqpick-0.1.0/fastQpick/utils.py +102 -0
- fastqpick-0.1.0/fastQpick.egg-info/PKG-INFO +197 -0
- fastqpick-0.1.0/fastQpick.egg-info/SOURCES.txt +14 -0
- fastqpick-0.1.0/fastQpick.egg-info/dependency_links.txt +1 -0
- fastqpick-0.1.0/fastQpick.egg-info/entry_points.txt +2 -0
- fastqpick-0.1.0/fastQpick.egg-info/requires.txt +2 -0
- fastqpick-0.1.0/fastQpick.egg-info/top_level.txt +1 -0
- fastqpick-0.1.0/pyproject.toml +47 -0
- fastqpick-0.1.0/setup.cfg +4 -0
- fastqpick-0.1.0/tests/test_fastQpick.py +320 -0
fastqpick-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
BSD 2-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024, Pachter Lab
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
16
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
17
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
18
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
19
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
20
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
21
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
22
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
23
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
24
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
fastqpick-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: fastQpick
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.
|
|
5
|
+
Author-email: Joseph Rich <josephrich98@gmail.com>
|
|
6
|
+
Maintainer-email: Joseph Rich <josephrich98@gmail.com>
|
|
7
|
+
License: BSD 2-Clause License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2024, Pachter Lab
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without
|
|
12
|
+
modification, are permitted provided that the following conditions are met:
|
|
13
|
+
|
|
14
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
15
|
+
list of conditions and the following disclaimer.
|
|
16
|
+
|
|
17
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
18
|
+
this list of conditions and the following disclaimer in the documentation
|
|
19
|
+
and/or other materials provided with the distribution.
|
|
20
|
+
|
|
21
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
22
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
23
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
24
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
25
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
26
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
27
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
28
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
29
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
30
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
31
|
+
|
|
32
|
+
Project-URL: Homepage, https://github.com/pachterlab/fastQpick
|
|
33
|
+
Keywords: fastQpick,bioinformatics,statistics,RNA-seq,DNA-seq
|
|
34
|
+
Classifier: Environment :: Console
|
|
35
|
+
Classifier: Framework :: Jupyter
|
|
36
|
+
Classifier: Intended Audience :: Science/Research
|
|
37
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
43
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
44
|
+
Classifier: Topic :: Utilities
|
|
45
|
+
Requires-Python: >=3.7
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
License-File: LICENSE
|
|
48
|
+
Requires-Dist: pyfastx>=2.0.0
|
|
49
|
+
Requires-Dist: tqdm>=4.66.0
|
|
50
|
+
|
|
51
|
+
# fastQpick
|
|
52
|
+
|
|
53
|
+
Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
### Install via PyPI
|
|
60
|
+
```bash
|
|
61
|
+
pip install fastQpick
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Install from Source Code
|
|
65
|
+
|
|
66
|
+
Using pip:
|
|
67
|
+
```bash
|
|
68
|
+
pip install git+https://github.com/pachterlab/fastQpick.git
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or clone the repository and build manually:
|
|
72
|
+
```bash
|
|
73
|
+
git clone https://github.com/pachterlab/fastQpick.git
|
|
74
|
+
cd fastQpick
|
|
75
|
+
python -m build
|
|
76
|
+
python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Command-line Interface
|
|
84
|
+
|
|
85
|
+
Run `fastQpick` with a specified fraction and options:
|
|
86
|
+
```bash
|
|
87
|
+
fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Python API
|
|
91
|
+
|
|
92
|
+
Use `fastQpick` in your Python code:
|
|
93
|
+
```python
|
|
94
|
+
from fastQpick import fastQpick
|
|
95
|
+
|
|
96
|
+
fastQpick(
|
|
97
|
+
input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
|
|
98
|
+
fraction=FRACTION,
|
|
99
|
+
...
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Documentation
|
|
106
|
+
|
|
107
|
+
- **Command-line Help**: Use the following command to see all available options:
|
|
108
|
+
```bash
|
|
109
|
+
fastQpick --help
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
- **Python API Help**: Use the `help` function to explore the API:
|
|
113
|
+
```python
|
|
114
|
+
help(fastQpick)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Options
|
|
118
|
+
- input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
|
|
119
|
+
- fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
|
|
120
|
+
- seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
|
|
121
|
+
- output_dir (str) Output directory. Default: ./fastQpick_output
|
|
122
|
+
- gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
|
|
123
|
+
- group_size (int) The size of grouped files. Provide the files of each group sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
|
|
124
|
+
- replacement (bool) Sample with replacement. Default: False (without replacement).
|
|
125
|
+
- overwrite (bool) Overwrite existing output files. Default: False
|
|
126
|
+
- low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False
|
|
127
|
+
- verbose (bool) Whether to print progress information. Default: True
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Features
|
|
132
|
+
|
|
133
|
+
- Efficient sampling of large FASTQ files.
|
|
134
|
+
- Works with both single and paired-end sequencing data.
|
|
135
|
+
- Supports sampling with or without replacement.
|
|
136
|
+
- Command-line interface and Python API for seamless integration.
|
|
137
|
+
- Memory efficient - in low-memory mode, only uses as much memory as a list of (small) integers the length of the number of reads in the fastq file for each file.
|
|
138
|
+
- Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
|
|
139
|
+
|
|
140
|
+
## Low memory mode vs. standard
|
|
141
|
+
Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
|
|
142
|
+
- Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
|
|
143
|
+
- Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Examples
|
|
148
|
+
|
|
149
|
+
### 1. Sample 10% of reads with replacement from a FASTQ file:
|
|
150
|
+
|
|
151
|
+
**Command-line**
|
|
152
|
+
```bash
|
|
153
|
+
fastQpick --fraction 0.1 -r input.fastq
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Python**
|
|
157
|
+
```python
|
|
158
|
+
from fastQpick import fastQpick
|
|
159
|
+
|
|
160
|
+
fastQpick(
|
|
161
|
+
input_files='input.fastq',
|
|
162
|
+
fraction=0.1,
|
|
163
|
+
replacement=True
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
|
|
168
|
+
|
|
169
|
+
**Command-line**
|
|
170
|
+
```bash
|
|
171
|
+
fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Python**
|
|
175
|
+
```python
|
|
176
|
+
from fastQpick import fastQpick
|
|
177
|
+
|
|
178
|
+
fastQpick(
|
|
179
|
+
input_files=['input1_R1.fastq', 'input1_R2.fastq'],
|
|
180
|
+
fraction=1,
|
|
181
|
+
seed="42,43,44",
|
|
182
|
+
replacement=True,
|
|
183
|
+
group_size=2,
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Contributing
|
|
195
|
+
|
|
196
|
+
We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
|
|
197
|
+
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# fastQpick
|
|
2
|
+
|
|
3
|
+
Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
### Install via PyPI
|
|
10
|
+
```bash
|
|
11
|
+
pip install fastQpick
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
### Install from Source Code
|
|
15
|
+
|
|
16
|
+
Using pip:
|
|
17
|
+
```bash
|
|
18
|
+
pip install git+https://github.com/pachterlab/fastQpick.git
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or clone the repository and build manually:
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/pachterlab/fastQpick.git
|
|
24
|
+
cd fastQpick
|
|
25
|
+
python -m build
|
|
26
|
+
python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### Command-line Interface
|
|
34
|
+
|
|
35
|
+
Run `fastQpick` with a specified fraction and options:
|
|
36
|
+
```bash
|
|
37
|
+
fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Python API
|
|
41
|
+
|
|
42
|
+
Use `fastQpick` in your Python code:
|
|
43
|
+
```python
|
|
44
|
+
from fastQpick import fastQpick
|
|
45
|
+
|
|
46
|
+
fastQpick(
|
|
47
|
+
input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
|
|
48
|
+
fraction=FRACTION,
|
|
49
|
+
...
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Documentation
|
|
56
|
+
|
|
57
|
+
- **Command-line Help**: Use the following command to see all available options:
|
|
58
|
+
```bash
|
|
59
|
+
fastQpick --help
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
- **Python API Help**: Use the `help` function to explore the API:
|
|
63
|
+
```python
|
|
64
|
+
help(fastQpick)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Options
|
|
68
|
+
- input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
|
|
69
|
+
- fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
|
|
70
|
+
- seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
|
|
71
|
+
- output_dir (str) Output directory. Default: ./fastQpick_output
|
|
72
|
+
- gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
|
|
73
|
+
- group_size (int) The size of grouped files. Provide the files of each group sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
|
|
74
|
+
- replacement (bool) Sample with replacement. Default: False (without replacement).
|
|
75
|
+
- overwrite (bool) Overwrite existing output files. Default: False
|
|
76
|
+
- low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False
|
|
77
|
+
- verbose (bool) Whether to print progress information. Default: True
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Features
|
|
82
|
+
|
|
83
|
+
- Efficient sampling of large FASTQ files.
|
|
84
|
+
- Works with both single and paired-end sequencing data.
|
|
85
|
+
- Supports sampling with or without replacement.
|
|
86
|
+
- Command-line interface and Python API for seamless integration.
|
|
87
|
+
- Memory efficient - in low-memory mode, only uses as much memory as a list of (small) integers the length of the number of reads in the fastq file for each file.
|
|
88
|
+
- Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
|
|
89
|
+
|
|
90
|
+
## Low memory mode vs. standard
|
|
91
|
+
Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
|
|
92
|
+
- Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
|
|
93
|
+
- Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Examples
|
|
98
|
+
|
|
99
|
+
### 1. Sample 10% of reads with replacement from a FASTQ file:
|
|
100
|
+
|
|
101
|
+
**Command-line**
|
|
102
|
+
```bash
|
|
103
|
+
fastQpick --fraction 0.1 -r input.fastq
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Python**
|
|
107
|
+
```python
|
|
108
|
+
from fastQpick import fastQpick
|
|
109
|
+
|
|
110
|
+
fastQpick(
|
|
111
|
+
input_files='input.fastq',
|
|
112
|
+
fraction=0.1,
|
|
113
|
+
replacement=True
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
|
|
118
|
+
|
|
119
|
+
**Command-line**
|
|
120
|
+
```bash
|
|
121
|
+
fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Python**
|
|
125
|
+
```python
|
|
126
|
+
from fastQpick import fastQpick
|
|
127
|
+
|
|
128
|
+
fastQpick(
|
|
129
|
+
input_files=['input1_R1.fastq', 'input1_R2.fastq'],
|
|
130
|
+
fraction=1,
|
|
131
|
+
seed="42,43,44",
|
|
132
|
+
replacement=True,
|
|
133
|
+
group_size=2,
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Contributing
|
|
145
|
+
|
|
146
|
+
We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
|
|
147
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import gzip
|
|
3
|
+
import os
|
|
4
|
+
import random
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
import pyfastx # to loop through fastq (faster than custom python code)
|
|
7
|
+
|
|
8
|
+
from fastQpick._version import __version__
|
|
9
|
+
from fastQpick.utils import save_params_to_config_file, is_directory_effectively_empty, group_items, count_reads
|
|
10
|
+
|
|
11
|
+
# Global variables
|
|
12
|
+
valid_fastq_extensions = (".fastq", ".fq", ".fastq.gz", ".fq.gz")
|
|
13
|
+
batch_size = 200000 # for buffer
|
|
14
|
+
fastq_to_length_dict = {} # set to empty, and the user can provide otherwise it will be calculated
|
|
15
|
+
|
|
16
|
+
def write_fastq(input_fastq, output_path, occurrence_list, total_reads, gzip_output, seed = None, verbose = True):
    """
    Write the sampled reads of one FASTQ file to ``output_path``.

    The input is streamed once with ``pyfastx.Fastx``; the read at position
    ``i`` is emitted ``occurrence_list[i]`` times (zero times drops it).

    Parameters
    ----------
    input_fastq        Path of the FASTQ file to read.
    output_path        Path of the file to write.
    occurrence_list    Per-read copy counts, indexed by read position.
    total_reads        Only used as the progress-bar total.
    gzip_output        If truthy, write through ``gzip.open`` in text mode.
    seed               Only used in the progress-bar description.
    verbose            If truthy, wrap the reader in a tqdm progress bar.
    """
    opener, mode = (gzip.open, "wt") if gzip_output else (open, "w")

    # The reader is created before the output file is opened, so a failure
    # to open the input does not leave an empty output file behind.
    reads = pyfastx.Fastx(input_fastq)
    if verbose:
        reads = tqdm(reads, desc=f"Iterating through seed {seed}, file {input_fastq}", unit="read", total=total_reads)

    pending = []  # records waiting to be flushed to disk

    with opener(output_path, mode) as handle:
        for read_number, (name, seq, qual) in enumerate(reads, start=1):
            copies = occurrence_list[read_number - 1]
            # The separator line is always written as a bare "+".
            pending.extend([f"@{name}\n{seq}\n+\n{qual}\n"] * copies)

            # Flush every `batch_size` input reads (counted on reads seen,
            # not on the number of buffered records).
            if read_number % batch_size == 0:
                handle.writelines(pending)
                pending.clear()

        # Flush whatever is left after the final partial batch.
        if pending:
            handle.writelines(pending)
            pending.clear()
|
|
48
|
+
|
|
49
|
+
def make_occurrence_list(file, seed, total_reads, number_of_reads_to_sample, replacement, low_memory, verbose):
    """
    Draw random read positions and tally how often each one was picked.

    Parameters
    ----------
    file                         Only used in progress messages.
    seed                         Only used in progress messages (the RNG is seeded by the caller).
    total_reads                  Size of the population; positions are 0..total_reads-1.
    number_of_reads_to_sample    Number of positions to draw.
    replacement                  Draw with replacement if truthy, otherwise without.
    low_memory                   If truthy, feed the draws to the tally through an iterator
                                 instead of a materialized tuple.
    verbose                      If truthy, print a message and show a tqdm progress bar.

    Returns
    -------
    list of int of length ``total_reads``; entry ``i`` is the number of times
    position ``i`` was drawn.
    """
    if verbose:
        print(f"Calculating total reads and determining random indices for seed {seed}, file {file}")

    population = range(total_reads)

    if replacement and low_memory:
        # Lazy: one draw per step while the tally loop consumes it.
        drawn = (random.choice(population) for _ in range(number_of_reads_to_sample))
    elif replacement:
        drawn = tuple(random.choices(population, k=number_of_reads_to_sample))  # with replacement
    elif low_memory:
        # random.sample still builds the full list up front; only the extra tuple copy is avoided.
        drawn = iter(random.sample(population, k=number_of_reads_to_sample))
    else:
        drawn = tuple(random.sample(population, k=number_of_reads_to_sample))  # without replacement

    # A flat list of counters is used rather than a Counter/dict to save memory.
    counts = [0] * total_reads

    if verbose:
        progress = tqdm(drawn, desc=f"Counting occurrences for seed {seed}, file {file}", unit="read", total=number_of_reads_to_sample)
    else:
        progress = drawn

    for position in progress:
        counts[position] += 1

    return counts
|
|
79
|
+
|
|
80
|
+
def bootstrap_single_file(files_total = None, gzip_output = None, output_directory = None, seed = None, fraction = None, replacement = None, low_memory = False, verbose=True):
    """
    Sample one FASTQ file, or one group of paired files, and write the result.

    A single occurrence list is drawn from the read count of the first file
    and reused for every file in the group, so the same read positions are
    kept in each of them.

    Parameters
    ----------
    files_total         A FASTQ path (str) or a sequence of paths forming one group.
    gzip_output         If truthy, outputs are gzip-compressed and named ``*.gz``.
    output_directory    Directory for the outputs; a falsy value writes to the
                        current working directory.
    seed                Only used in progress messages (the RNG is seeded by the caller).
    fraction            Fraction of reads to sample; ``int(fraction * total_reads)`` reads are drawn.
    replacement         Sample with replacement if truthy.
    low_memory          Forwarded to ``make_occurrence_list``.
    verbose             If truthy, show progress output.

    Raises
    ------
    KeyError    If the first file has no entry in ``fastq_to_length_dict``.
    """
    if isinstance(files_total, str):
        files_total = (files_total, )

    total_reads = fastq_to_length_dict[files_total[0]]
    number_of_reads_to_sample = int(fraction * total_reads)

    occurrence_list = make_occurrence_list(file=files_total[0], seed=seed, total_reads=total_reads, number_of_reads_to_sample=number_of_reads_to_sample, replacement=replacement, low_memory=low_memory, verbose=verbose)

    # Create the output directory once, before any file is written.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for file in files_total:
        # `output_directory or ""` keeps the default (None) usable:
        # os.path.join(None, ...) would raise TypeError.
        output_path = os.path.join(output_directory or "", os.path.basename(file))

        # Make the output extension agree with the requested compression.
        if gzip_output and not output_path.endswith(".gz"):
            output_path += ".gz"
        elif not gzip_output and output_path.endswith(".gz"):
            output_path = output_path[:-3]

        # write fastq
        write_fastq(input_fastq = file, output_path = output_path, occurrence_list = occurrence_list, total_reads = total_reads, gzip_output = gzip_output, seed = seed, verbose = verbose)
|
|
102
|
+
|
|
103
|
+
def sample_multiple_files(file_list, fraction, seed_list, output, gzip_output, replacement, low_memory, verbose):
    """
    Run the sampling for every seed and every file (or file group).

    The global ``random`` generator is re-seeded once per seed, before the
    files of that seed are processed.

    Output layout: with a single seed, files are written directly into
    ``output``. With several seeds, each seed gets its own ``seed_<seed>``
    subdirectory of ``output``; otherwise every seed would write to the same
    paths and only the last seed's files would survive.

    Parameters
    ----------
    file_list      Paths, or tuples of paths forming one group each.
    fraction       Fraction of reads to sample.
    seed_list      Iterable of seeds.
    output         Base output directory.
    gzip_output    If truthy, outputs are gzip-compressed.
    replacement    Sample with replacement if truthy.
    low_memory     Forwarded to ``bootstrap_single_file``.
    verbose        If truthy, show progress output.
    """
    seed_list = list(seed_list)
    separate_seed_dirs = len(seed_list) > 1

    for seed in seed_list:
        random.seed(seed)
        if separate_seed_dirs:
            seed_output = os.path.join(output or "", f"seed_{seed}")
        else:
            seed_output = output
        for file in file_list:
            bootstrap_single_file(files_total = file, gzip_output = gzip_output, output_directory = seed_output, seed = seed, fraction = fraction, replacement = replacement, low_memory = low_memory, verbose = verbose)
|
|
108
|
+
|
|
109
|
+
def make_fastq_to_length_dict(file_list, verbose=True):
    """
    Fill the module-level ``fastq_to_length_dict`` with read counts.

    Each entry of ``file_list`` is either a path (str) or a tuple of paths
    forming one group. For a group, only the first file is counted and that
    count is stored for every member. Entries whose files are all already in
    the dictionary are skipped; entries of any other type are ignored.

    Parameters
    ----------
    file_list    Iterable of str paths and/or tuples of str paths.
    verbose      If truthy, print which file is being counted and the final dictionary.
    """
    global fastq_to_length_dict

    for entry in file_list:
        # Treat a lone path as a one-member group so both cases share one code path.
        if isinstance(entry, tuple):
            members = entry
        elif isinstance(entry, str):
            members = (entry,)
        else:
            continue

        if all(member in fastq_to_length_dict for member in members):
            continue

        representative = members[0]
        if verbose:
            print(f"Counting {representative}")
        read_total = count_reads(representative)

        for member in members:
            fastq_to_length_dict[member] = read_total

    if verbose:
        print("fastq_to_length_dict:", fastq_to_length_dict)
|
|
129
|
+
|
|
130
|
+
def _collect_fastq_paths(input_files):
    """Validate `input_files` and expand directories into a flat list of FASTQ file paths."""
    if isinstance(input_files, str):
        input_files = [input_files]
    elif not isinstance(input_files, (tuple, list)):
        raise ValueError("Input file list must be a string, tuple of strings, or list of strings.")

    extensions = tuple(valid_fastq_extensions)
    collected = []
    for path in input_files:
        if not isinstance(path, str):
            raise ValueError("Input file list must be a string, tuple of strings, or list of strings.")
        if not os.path.exists(path):
            raise FileNotFoundError(f"File or directory '{path}' not found.")

        if os.path.isdir(path):
            # os.listdir returns bare names, so join them with the directory
            # before testing or storing them. Sorting makes the order (and
            # therefore the grouping of paired files) deterministic.
            found = []
            for entry in sorted(os.listdir(path)):
                candidate = os.path.join(path, entry)
                if os.path.isfile(candidate) and entry.endswith(extensions):
                    found.append(candidate)
            if not found:
                raise ValueError(f"No valid FASTQ files found in directory '{path}'.")
            collected.extend(found)
        elif os.path.isfile(path):
            if not path.endswith(extensions):
                raise ValueError(f"File '{path}' is not a valid FASTQ file.")
            collected.append(path)

    return collected


def fastQpick(input_files, fraction, seed=42, output_dir="fastQpick_output", gzip_output=False, group_size=1, replacement=False, overwrite=False, low_memory=False, verbose=True, **kwargs):
    """
    Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.

    Parameters
    ----------
    input_files (str, list, or tuple)    Input FASTQ file(s) and/or directories containing FASTQ files (direct children only).
    fraction (int or float)              The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 turns on sampling with replacement automatically.
    seed (int or str)                    Random seed(s). Can provide multiple seeds separated by commas. Default: 42
    output_dir (str)                     Output directory. Default: ./fastQpick_output
    gzip_output (bool)                   Whether or not to gzip the output. Default: False (uncompressed)
    group_size (int)                     The size of grouped files. Provide the files of each group sequentially. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
    replacement (bool)                   Sample with replacement. Default: False (without replacement).
    overwrite (bool)                     Overwrite existing output files. Default: False
    low_memory (bool)                    Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False
    verbose (bool)                       Whether to print progress information. Default: True

    kwargs
    ------
    fastq_to_length_dict (dict)          Dictionary of FASTQ file paths to number of reads in each file. If not provided, will be calculated.

    Raises
    ------
    FileExistsError      If overwrite is False and output_dir exists and is not effectively empty.
    FileNotFoundError    If an input path does not exist.
    ValueError           If input_files has the wrong type, a file lacks a FASTQ extension, or a directory holds no FASTQ files.
    """
    global fastq_to_length_dict

    # A user-supplied read-count dictionary replaces the module-level cache.
    if isinstance(kwargs.get("fastq_to_length_dict"), dict):
        fastq_to_length_dict = kwargs["fastq_to_length_dict"]

    # Check overwrite
    if not overwrite:
        if os.path.exists(output_dir) and not is_directory_effectively_empty(output_dir):  # check if dir exists and is not empty
            raise FileExistsError(f"Output directory '{output_dir}' already exists. Please specify a different output directory or set the overwrite flag to True.")

    # Save arguments to a config file.
    # NOTE(review): the helper takes no parameters, so it presumably inspects
    # this caller's frame. Keep the call directly in this function and before
    # any argument is reassigned.
    os.makedirs(output_dir, exist_ok=True)
    config_file = os.path.join(output_dir, "fastQpick_config.json")
    save_params_to_config_file(config_file)

    # if fraction >= 1, set replacement to True
    if float(fraction) >= 1.0:
        replacement = True

    # Keep only valid FASTQ files, expanding directories to the FASTQ files directly inside them.
    input_files_parsed = _collect_fastq_paths(input_files)

    if isinstance(seed, int):  # if a single int is passed as a seed
        seed = [seed]
    elif isinstance(seed, str):  # if a string of comma-separated ints is passed as a seed (like on the command line)
        seed = [int(specific_seed) for specific_seed in seed.split(",")]

    group_size = int(group_size)  # make sure group_size is an int (not a string)
    fraction = float(fraction)  # make sure fraction is a float (not a string)

    if group_size > 1:
        input_files_parsed = group_items(input_files_parsed, group_size=group_size)

    # Count reads in each file and store in a dictionary
    make_fastq_to_length_dict(input_files_parsed, verbose=verbose)

    # Do the sampling
    sample_multiple_files(file_list=input_files_parsed, fraction=fraction, seed_list=seed, output=output_dir, gzip_output=gzip_output, replacement=replacement, low_memory=low_memory, verbose=verbose)
|
|
211
|
+
|
|
212
|
+
def main(argv=None):
    """
    Command-line entry point: parse arguments and call ``fastQpick``.

    Parameters
    ----------
    argv (list of str or None)    Arguments to parse. None (the default) makes argparse read ``sys.argv[1:]``.
    """
    # Create argument parser
    parser = argparse.ArgumentParser(description="Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.")
    # Required option: no default, and parsed as a float so bad values are rejected by argparse.
    parser.add_argument("-f", "--fraction", required=True, type=float, help="The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.")
    parser.add_argument("-s", "--seed", required=False, default=42, help="Random seed(s). Can provide multiple seeds separated by commas. Default: 42")
    parser.add_argument("-o", "--output_dir", required=False, type=str, default="fastQpick_output", help="Output directory. Default: ./fastQpick_output")
    # A real flag like -r/-w/-l: without store_true, "-z" demanded a value and any string (even "False") was truthy.
    parser.add_argument("-z", "--gzip_output", action="store_true", help="Whether or not to gzip the output. Default: False (uncompressed)")
    parser.add_argument("-g", "--group_size", required=False, type=int, default=1, help="The size of grouped files. Provide each pair of files sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)")
    parser.add_argument("-r", "--replacement", action="store_true", help="Sample with replacement. Default: False (without replacement).")
    parser.add_argument("-w", "--overwrite", action="store_true", help="Overwrite existing output files. Default: False")
    parser.add_argument("-l", "--low_memory", action="store_true", help="Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False")
    # Stored under `verbose` so the namespace attribute means what it says: True unless -q is given.
    parser.add_argument("-q", "--quiet", dest="verbose", action="store_false", help="Turn off verbose output. Default: False")
    parser.add_argument("-v", "--version", action="version", version=f"fastQpick {__version__}", help="Show program's version number and exit")

    # Positional argument for input files (indefinite number)
    parser.add_argument("input_files", nargs="+", help="Input FASTQ file(s) (one after the other, space-separated) or FASTQ folder(s)")

    # Parse arguments
    args = parser.parse_args(argv)

    fastQpick(input_files=args.input_files,
              fraction=args.fraction,
              seed=args.seed,
              output_dir=args.output_dir,
              gzip_output=args.gzip_output,
              group_size=args.group_size,
              replacement=args.replacement,
              overwrite=args.overwrite,
              low_memory=args.low_memory,
              verbose=args.verbose)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import inspect
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from collections import OrderedDict
|
|
6
|
+
import pyfastx
|
|
7
|
+
|
|
8
|
+
def count_reads(filepath):
    """Return the number of records that ``pyfastx.Fastx`` yields for ``filepath``."""
    total = 0
    for _ in pyfastx.Fastx(filepath):
        total += 1
    return total
|
|
12
|
+
|
|
13
|
+
def _iter_fastq_records(handle):
    """Yield stripped (header, sequence, plus_line, quality) tuples from an open text handle."""
    readline = handle.readline
    while True:
        header = readline().strip()
        sequence = readline().strip()
        plus_line = readline().strip()
        quality = readline().strip()

        # readline() returns "" at end of file; a blank header line also ends iteration.
        if not header:
            return

        yield header, sequence, plus_line, quality


def read_fastq(fastq_file, include_plus_line=False):
    """Lazily iterate over the records of a FASTQ file, four lines at a time.

    Files whose name ends in ".gz" are opened with ``gzip.open`` in text mode;
    anything else is opened as plain text. Every line is whitespace-stripped.

    Args:
        fastq_file: Path to the FASTQ file (str).
        include_plus_line: When True yield ``(header, sequence, plus_line, quality)``;
            otherwise yield ``(header, sequence, quality)``.

    Yields:
        One tuple of strings per record, in file order.

    Raises:
        RuntimeError: If opening or reading the file fails. The original
            exception is attached as ``__cause__``.
    """
    is_gzipped = fastq_file.endswith(".gz")
    open_func = gzip.open if is_gzipped else open
    open_mode = "rt" if is_gzipped else "r"

    try:
        with open_func(fastq_file, open_mode) as file:
            records = _iter_fastq_records(file)
            # Branch once, outside the per-record loop.
            if include_plus_line:
                yield from records
            else:
                for header, sequence, _plus_line, quality in records:
                    yield header, sequence, quality
    except Exception as e:
        raise RuntimeError(f"Error reading FASTQ file '{fastq_file}': {e}") from e
|
|
45
|
+
|
|
46
|
+
def make_function_parameter_to_value_dict(levels_up = 1):
    """Collect the arguments of a function higher up the call stack.

    Args:
        levels_up: How many frames to walk up from this function's own frame.
            1 targets the direct caller, 2 the caller's caller, and so on.

    Returns:
        OrderedDict mapping each named parameter of the target function to the
        value found in that frame's locals. A ``*args`` tuple is stored under the
        key "*args"; the entries of a ``**kwargs`` dict are merged in by key.

    Raises:
        ValueError: If the call stack has fewer than ``levels_up`` frames above
            this function (or the interpreter exposes no frames at all).
    """
    frame = inspect.currentframe()

    # Walk up the requested number of frames, stopping if the stack runs out.
    for _ in range(levels_up):
        if frame is None:
            break
        frame = frame.f_back

    if frame is None:
        # Fail with a clear message instead of an AttributeError from inspect.
        raise ValueError(f"Cannot go {levels_up} level(s) up: the call stack is not that deep.")

    function_args, varargs, varkw, values = inspect.getargvalues(frame)

    params = OrderedDict()

    # explicit function arguments
    for arg in function_args:
        params[arg] = values[arg]

    # *args
    if varargs:
        params["*args"] = values[varargs]

    # **kwargs
    if varkw:
        for key, value in values[varkw].items():
            params[key] = value

    return params
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def save_params_to_config_file(out_file="run_config.json"):
    """Write the calling function's arguments to ``out_file`` as indented JSON.

    The parent directory of ``out_file`` is created when the path has one.
    """
    parent_dir = os.path.dirname(out_file)
    if parent_dir:
        # A bare filename has no directory component, so there is nothing to create.
        os.makedirs(parent_dir, exist_ok=True)

    # levels_up=2 skips the helper's own frame and this function's frame,
    # landing on whoever called save_params_to_config_file.
    caller_params = make_function_parameter_to_value_dict(levels_up = 2)

    with open(out_file, "w") as handle:
        json.dump(caller_params, handle, indent=4)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def is_directory_effectively_empty(directory_path):
    """Return True when ``directory_path`` contains only dot-prefixed entries (or nothing)."""
    for name in os.listdir(directory_path):
        # Dot-prefixed names, which includes ".DS_Store", do not count as content.
        if not name.startswith("."):
            return False
    return True
|
|
98
|
+
|
|
99
|
+
def group_items(file_list, group_size=2):
    """Split ``file_list`` into consecutive tuples of ``group_size`` items.

    Args:
        file_list: Sequence of items (e.g. file paths) to group, in order.
        group_size: Number of items per group; must be at least 1.

    Returns:
        List of tuples, each holding ``group_size`` consecutive items.

    Raises:
        ValueError: If ``group_size`` is smaller than 1, or if the length of
            ``file_list`` is not a multiple of ``group_size``.
    """
    # Reject 0 (would divide by zero) and negatives (would silently return []).
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size}.")
    if len(file_list) % group_size != 0:
        raise ValueError(f"The list length must be divisible by {group_size} to form groups.")
    return [tuple(file_list[i:i + group_size]) for i in range(0, len(file_list), group_size)]
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: fastQpick
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement.
|
|
5
|
+
Author-email: Joseph Rich <josephrich98@gmail.com>
|
|
6
|
+
Maintainer-email: Joseph Rich <josephrich98@gmail.com>
|
|
7
|
+
License: BSD 2-Clause License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2024, Pachter Lab
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without
|
|
12
|
+
modification, are permitted provided that the following conditions are met:
|
|
13
|
+
|
|
14
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
15
|
+
list of conditions and the following disclaimer.
|
|
16
|
+
|
|
17
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
18
|
+
this list of conditions and the following disclaimer in the documentation
|
|
19
|
+
and/or other materials provided with the distribution.
|
|
20
|
+
|
|
21
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
22
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
23
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
24
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
25
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
26
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
27
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
28
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
29
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
30
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
31
|
+
|
|
32
|
+
Project-URL: Homepage, https://github.com/pachterlab/fastQpick
|
|
33
|
+
Keywords: fastQpick,bioinformatics,statistics,RNA-seq,DNA-seq
|
|
34
|
+
Classifier: Environment :: Console
|
|
35
|
+
Classifier: Framework :: Jupyter
|
|
36
|
+
Classifier: Intended Audience :: Science/Research
|
|
37
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
43
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
44
|
+
Classifier: Topic :: Utilities
|
|
45
|
+
Requires-Python: >=3.7
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
License-File: LICENSE
|
|
48
|
+
Requires-Dist: pyfastx>=2.0.0
|
|
49
|
+
Requires-Dist: tqdm>=4.66.0
|
|
50
|
+
|
|
51
|
+
# fastQpick
|
|
52
|
+
|
|
53
|
+
Fast and memory-efficient sampling of DNA-seq or RNA-seq FASTQ data with or without replacement.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
### Install via PyPI
|
|
60
|
+
```bash
|
|
61
|
+
pip install fastQpick
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Install from Source Code
|
|
65
|
+
|
|
66
|
+
Using pip:
|
|
67
|
+
```bash
|
|
68
|
+
pip install git+https://github.com/pachterlab/fastQpick.git
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or clone the repository and build manually:
|
|
72
|
+
```bash
|
|
73
|
+
git clone https://github.com/pachterlab/fastQpick.git
|
|
74
|
+
cd fastQpick
|
|
75
|
+
python -m build
|
|
76
|
+
python -m pip install dist/fastQpick-x.x.x-py3-none-any.whl
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Command-line Interface
|
|
84
|
+
|
|
85
|
+
Run `fastQpick` with a specified fraction and options:
|
|
86
|
+
```bash
|
|
87
|
+
fastQpick --fraction FRACTION [OPTIONS] FASTQ_FILE1 FASTQ_FILE2 ...
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Python API
|
|
91
|
+
|
|
92
|
+
Use `fastQpick` in your Python code:
|
|
93
|
+
```python
|
|
94
|
+
from fastQpick import fastQpick
|
|
95
|
+
|
|
96
|
+
fastQpick(
|
|
97
|
+
input_files=['FASTQ_FILE1', 'FASTQ_FILE2', ...],
|
|
98
|
+
fraction=FRACTION,
|
|
99
|
+
...
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Documentation
|
|
106
|
+
|
|
107
|
+
- **Command-line Help**: Use the following command to see all available options:
|
|
108
|
+
```bash
|
|
109
|
+
fastQpick --help
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
- **Python API Help**: Use the `help` function to explore the API:
|
|
113
|
+
```python
|
|
114
|
+
help(fastQpick)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Options
|
|
118
|
+
- input_files (str, list, or tuple) List of input FASTQ files or directories containing FASTQ files. Required. Positional argument on command line.
|
|
119
|
+
- fraction (int or float) The fraction of reads to sample, as a float greater than 0. Any value equal to or greater than 1 will turn on the -r flag automatically.
|
|
120
|
+
- seed (int or str) Random seed(s). Can provide multiple seeds separated by commas. Default: 42
|
|
121
|
+
- output_dir (str) Output directory. Default: ./fastQpick_output
|
|
122
|
+
- gzip_output (bool) Whether or not to gzip the output. Default: False (uncompressed)
|
|
123
|
+
- group_size (int) The size of grouped files. Provide the files of each group sequentially, separated by a space. E.g., I1, R1, R2 would have group_size=3. Default: 1 (unpaired)
|
|
124
|
+
- replacement (bool) Sample with replacement. Default: False (without replacement).
|
|
125
|
+
- overwrite (bool) Overwrite existing output files. Default: False
|
|
126
|
+
- low_memory (bool) Whether to use low memory mode (uses ~5.5x less memory than default, but adds marginal time to the data structure generation preprocessing). Default: False
|
|
127
|
+
- verbose (bool) Whether to print progress information. Default: True
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Features
|
|
132
|
+
|
|
133
|
+
- Efficient sampling of large FASTQ files.
|
|
134
|
+
- Works with both single and paired-end sequencing data.
|
|
135
|
+
- Supports sampling with or without replacement.
|
|
136
|
+
- Command-line interface and Python API for seamless integration.
|
|
137
|
+
- Memory efficient - in low-memory mode, only uses as much memory as a list of (small) integers the length of the number of reads in the fastq file for each file.
|
|
138
|
+
- Time efficient - only passes through the fastq once and writes to output in batches - can process 600M reads in 10-15 minutes
|
|
139
|
+
|
|
140
|
+
## Low memory mode vs. standard
|
|
141
|
+
Low memory mode vs. standard, when fraction=1 (i.e., number of reads to sample is the same as the number of reads in the fastq):
|
|
142
|
+
- Adds an extra ~1-3 seconds per million reads per group_size (i.e., 500M reads would take 30 minutes instead of 20-25 minutes)
|
|
143
|
+
- Saves an extra ~40MB RAM per million reads (i.e., 500M reads would take 3.75GB RAM vs 20.6GB RAM)
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Examples
|
|
148
|
+
|
|
149
|
+
### 1. Sample 10% of reads with replacement from a FASTQ file:
|
|
150
|
+
|
|
151
|
+
**Command-line**
|
|
152
|
+
```bash
|
|
153
|
+
fastQpick --fraction 0.1 -r input.fastq
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Python**
|
|
157
|
+
```python
|
|
158
|
+
from fastQpick import fastQpick
|
|
159
|
+
|
|
160
|
+
fastQpick(
|
|
161
|
+
input_files='input.fastq',
|
|
162
|
+
fraction=0.1,
|
|
163
|
+
replacement=True
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### 2. Sample 100% of reads with replacement from multiple paired FASTQ files (R1, R2) across three seeds (i.e., bootstrapping):
|
|
168
|
+
|
|
169
|
+
**Command-line**
|
|
170
|
+
```bash
|
|
171
|
+
fastQpick --fraction 1 -s 42,43,44 -r -g 2 input1_R1.fastq input1_R2.fastq
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Python**
|
|
175
|
+
```python
|
|
176
|
+
from fastQpick import fastQpick
|
|
177
|
+
|
|
178
|
+
fastQpick(
|
|
179
|
+
input_files=['input1_R1.fastq', 'input1_R2.fastq'],
|
|
180
|
+
fraction=1,
|
|
181
|
+
seed="42,43,44",
|
|
182
|
+
replacement=True,
|
|
183
|
+
group_size=2,
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
fastQpick is licensed under the 2-clause BSD license. See the [LICENSE](LICENSE) file for details.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Contributing
|
|
195
|
+
|
|
196
|
+
We welcome contributions! Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to get involved.
|
|
197
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
fastQpick/__init__.py
|
|
5
|
+
fastQpick/_version.py
|
|
6
|
+
fastQpick/main.py
|
|
7
|
+
fastQpick/utils.py
|
|
8
|
+
fastQpick.egg-info/PKG-INFO
|
|
9
|
+
fastQpick.egg-info/SOURCES.txt
|
|
10
|
+
fastQpick.egg-info/dependency_links.txt
|
|
11
|
+
fastQpick.egg-info/entry_points.txt
|
|
12
|
+
fastQpick.egg-info/requires.txt
|
|
13
|
+
fastQpick.egg-info/top_level.txt
|
|
14
|
+
tests/test_fastQpick.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fastQpick
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "fastQpick"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fast and memory-efficient sampling of DNA-Seq or RNA-seq fastq data with or without replacement."
|
|
9
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Joseph Rich", email = "josephrich98@gmail.com"}
|
|
13
|
+
]
|
|
14
|
+
maintainers = [
|
|
15
|
+
{name = "Joseph Rich", email = "josephrich98@gmail.com"}
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.7"
|
|
18
|
+
keywords = ["fastQpick", "bioinformatics", "statistics", "RNA-seq", "DNA-seq"]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Environment :: Console",
|
|
21
|
+
"Framework :: Jupyter",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"License :: OSI Approved :: BSD License",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
"Programming Language :: Python :: 3.9",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
30
|
+
"Topic :: Utilities"
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"pyfastx>=2.0.0",
|
|
34
|
+
"tqdm>=4.66.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
"Homepage" = "https://github.com/pachterlab/fastQpick"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools]
|
|
41
|
+
packages = ["fastQpick"]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.package-data]
|
|
44
|
+
fastQpick = ["*.txt", "*.md", "*.csv"] # Ensure additional files like data or docs are included
|
|
45
|
+
|
|
46
|
+
[project.scripts]
|
|
47
|
+
fastQpick = "fastQpick.main:main" # This replaces the console_scripts entry
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import pytest
|
|
4
|
+
from fastQpick import fastQpick
|
|
5
|
+
from fastQpick.utils import read_fastq, count_reads
|
|
6
|
+
from pdb import set_trace as st
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def temp_fastq_file():
    """Yield the path of a temporary five-record FASTQ file and remove it afterwards."""
    content = """@Header1
AAAAAAAAAAAAAAAAAAAAA
+
IIIIIIIIIIIIIIIIIIIII
@Header2
CCCCCCCCCCCCCCCCCCCCC
+
IIIIIIIIIIIIIIIIIIIII
@Header3
GGGGGGGGGGGGGGGGGGGGG
+
IIIIIIIIIIIIIIIIIIIII
@Header4
TTTTTTTTTTTTTTTTTTTTT
+
IIIIIIIIIIIIIIIIIIIII
@Header5
AAAAAAAAAAAAACCCCCCCC
+
IIIIIIIIIIIIIIIIIIIII
"""
    # Write and close the file before handing out its path, so the content is
    # flushed to disk and no handle stays open while the test runs.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fastq") as temp_file:
        temp_file.write(content)

    try:
        yield temp_file.name  # Provide the file path to the test
    finally:
        # Cleanup after the test
        os.remove(temp_file.name)
|
|
39
|
+
|
|
40
|
+
# Fixture to create two temporary FASTQ files
|
|
41
|
+
@pytest.fixture
def temp_paired_fastq_files():
    """Yield the paths of two temporary four-record FASTQ files and remove them afterwards.

    Record N of the first file has header ``@HeaderN_1`` and record N of the
    second has ``@HeaderN_2``.
    """
    content_1 = """@Header1_1
AAAAAAAAAAAAAAAAAAAAA
+
IIIIIIIIIIIIIIIIIIIII
@Header2_1
CCCCCCCCCCCCCCCCCCCCC
+
IIIIIIIIIIIIIIIIIIIII
@Header3_1
GGGGGGGGGGGGGGGGGGGGG
+
IIIIIIIIIIIIIIIIIIIII
@Header4_1
TTTTTTTTTTTTTTTTTTTTT
+
IIIIIIIIIIIIIIIIIIIII
"""

    content_2 = """@Header1_2
AAAAAAAACCCCCCCCCCCCC
+
IIIIIIIIIIIIIIIIIIIII
@Header2_2
AAAAAAAGGGGGGGGGGGGGG
+
IIIIIIIIIIIIIIIIIIIII
@Header3_2
AAAAAAATTTTTTTTTTTTTT
+
IIIIIIIIIIIIIIIIIIIII
@Header4_2
CCCCCCCCAAAAAAAAAAAAA
+
IIIIIIIIIIIIIIIIIIIII
"""

    # Write and close both files before handing out their paths, so the content
    # is flushed to disk and no handle stays open while the test runs.
    paths = []
    for content in (content_1, content_2):
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fastq") as temp_file:
            temp_file.write(content)
        paths.append(temp_file.name)

    try:
        yield paths  # Yield the paths of both files
    finally:
        # Cleanup after the test
        for path in paths:
            os.remove(path)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def is_gzipped(file_path):
    """Return True when the file starts with the two-byte gzip magic number."""
    gzip_magic = b"\x1f\x8b"
    with open(file_path, "rb") as handle:
        return handle.read(2) == gzip_magic
|
|
99
|
+
|
|
100
|
+
def validate_fastq_format(file_path, ground_truth=None):
    """Assert that every record of a FASTQ file is well formed.

    Each record must have a header starting with "@", a plus line starting
    with "+", and sequence/quality strings of equal length. When a truthy
    ``ground_truth`` is given (a dict as built by ``make_fastq_dict``), each
    record must also match the entry stored under its header.
    """
    for header, seq, plus_line, qual in read_fastq(file_path, include_plus_line=True):
        assert header.startswith("@"), f"Header does not start with '@': {header}"
        assert len(seq) == len(qual), f"Sequence and quality lengths do not match: {seq} {qual}"
        assert plus_line.startswith("+"), f"Plus line does not start with '+': {plus_line}"

        if ground_truth:
            assert header in ground_truth, f"Header not found in ground truth: {header}"
            expected = ground_truth[header]
            # "expected" is the ground-truth value; "got" is what the file under test holds.
            assert seq == expected["sequence"], f"Sequence mismatch - expected: {expected['sequence']}; got: {seq}"
            assert plus_line == expected["plus_line"], f"Plus line mismatch - expected: {expected['plus_line']}; got: {plus_line}"
            assert qual == expected["quality"], f"Quality mismatch - expected: {expected['quality']}; got: {qual}"
|
|
111
|
+
|
|
112
|
+
def count_number_of_unique_headers(file_path):
    """Return how many distinct header lines occur in the FASTQ file at ``file_path``."""
    unique_headers = {
        header
        for header, _seq, _plus_line, _qual in read_fastq(file_path, include_plus_line=True)
    }
    return len(unique_headers)
|
|
117
|
+
|
|
118
|
+
def make_fastq_dict(file_path):
    """Map each header of a FASTQ file to its "sequence", "plus_line" and "quality" strings.

    If a header occurs more than once, the last record with that header wins.
    """
    records = {}
    for header, seq, plus_line, qual in read_fastq(file_path, include_plus_line=True):
        records[header] = {
            "sequence": seq,
            "plus_line": plus_line,
            "quality": qual,
        }
    return records
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def check_pairwise_agreement(temp_paired_fastq_files, temp_output_dir, gzip_output):
    """Assert that the two sampled output files contain matching reads in the same order.

    The output files are expected in ``temp_output_dir`` under the base names of
    the two inputs (plus ".gz" when ``gzip_output`` is true). Headers are compared
    after dropping everything from the last underscore onwards.
    """
    suffix = ".gz" if gzip_output else ""
    output_fastq_file1 = os.path.join(temp_output_dir, os.path.basename(temp_paired_fastq_files[0])) + suffix
    output_fastq_file2 = os.path.join(temp_output_dir, os.path.basename(temp_paired_fastq_files[1])) + suffix

    headers1 = [record[0] for record in read_fastq(output_fastq_file1, include_plus_line=True)]
    headers2 = [record[0] for record in read_fastq(output_fastq_file2, include_plus_line=True)]

    # zip() stops at the shorter input, so a length mismatch must be checked explicitly.
    assert len(headers1) == len(headers2), f"Paired outputs have different numbers of reads: {len(headers1)} != {len(headers2)}"

    for header1, header2 in zip(headers1, headers2):
        # Split headers up to the last underscore
        split_header1 = header1.rsplit('_', 1)[0]
        split_header2 = header2.rsplit('_', 1)[0]

        # Assert that the two headers are equal
        assert split_header1 == split_header2, f"Headers do not match: {split_header1} != {split_header2}"
|
|
149
|
+
|
|
150
|
+
def run_all_single_file_tests(temp_output_dir, temp_fastq_file, gzip_output, fraction, replacement):
    """Run the shared assertions on the sampled output of one input FASTQ file.

    Checks that the output directory exists and is non-empty, that every output
    record is well formed and matches the input, that compression matches
    ``gzip_output``, that the read count equals input count times ``fraction``,
    and that header uniqueness is consistent with ``replacement``.
    """
    assert os.path.exists(temp_output_dir), "Output directory does not exist!"
    assert len(os.listdir(temp_output_dir)) > 0, "No output files were created!"

    # The output keeps the input's base name, with ".gz" appended when compressed.
    sampled_path = os.path.join(temp_output_dir, os.path.basename(temp_fastq_file))
    if gzip_output:
        sampled_path = sampled_path + ".gz"

    validate_fastq_format(sampled_path, ground_truth=make_fastq_dict(temp_fastq_file))

    output_is_gzipped = is_gzipped(sampled_path)
    assert output_is_gzipped == gzip_output, f"Gzipped output - expected: {gzip_output}; got: {output_is_gzipped}"

    expected_count = count_reads(temp_fastq_file) * fraction
    actual_count = count_reads(sampled_path)
    assert actual_count == expected_count, f"Number of reads mismatch - expected: {expected_count}; got: {actual_count}"

    unique_count = count_number_of_unique_headers(sampled_path)

    if not replacement:
        # Without replacement no read may appear twice.
        assert unique_count == actual_count, f"Number of unique reads mismatch - expected: {actual_count}; got: {unique_count}"
    elif fraction > 1:
        # Oversampling with replacement must repeat at least one read.
        assert unique_count < actual_count, f"Number of unique reads mismatch - expected: less than {actual_count}; got: {unique_count}"
|
|
182
|
+
|
|
183
|
+
def test_single_file(temp_fastq_file):
    """Sample a fraction of 0.6 from one file, uncompressed, without replacement."""
    settings = {"fraction": 0.6, "gzip_output": False, "replacement": False}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **settings,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **settings,
        )
|
|
202
|
+
|
|
203
|
+
# st()
|
|
204
|
+
|
|
205
|
+
def test_single_file_bootstrapped(temp_fastq_file):
    """Sample a fraction of 1 from one file, uncompressed, with replacement."""
    settings = {"fraction": 1, "gzip_output": False, "replacement": True}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **settings,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **settings,
        )
|
|
224
|
+
|
|
225
|
+
# st()
|
|
226
|
+
|
|
227
|
+
def test_single_file_oversampled(temp_fastq_file):
    """Sample a fraction of 3 from one file, uncompressed, with replacement."""
    settings = {"fraction": 3, "gzip_output": False, "replacement": True}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **settings,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **settings,
        )
|
|
246
|
+
|
|
247
|
+
# st()
|
|
248
|
+
|
|
249
|
+
def test_single_gzipped(temp_fastq_file):
    """Sample a fraction of 0.6 from one file into gzipped output, without replacement."""
    settings = {"fraction": 0.6, "gzip_output": True, "replacement": False}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_fastq_file,
            seed=42,
            output_dir=temp_output_dir,
            group_size=1,
            overwrite=True,
            **settings,
        )

        run_all_single_file_tests(
            temp_output_dir=temp_output_dir,
            temp_fastq_file=temp_fastq_file,
            **settings,
        )
|
|
268
|
+
|
|
269
|
+
# st()
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def test_paired_files(temp_paired_fastq_files):
    """Sample a fraction of 0.75 from a pair of files (group_size=2) without replacement."""
    settings = {"fraction": 0.75, "gzip_output": False, "replacement": False}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_paired_fastq_files,
            seed=42,
            output_dir=temp_output_dir,
            group_size=2,
            overwrite=True,
            **settings,
        )

        # Each output file must pass the single-file checks on its own ...
        for fastq_file in temp_paired_fastq_files:
            run_all_single_file_tests(
                temp_output_dir=temp_output_dir,
                temp_fastq_file=fastq_file,
                **settings,
            )

        # ... and the two outputs must contain the same reads in the same order.
        check_pairwise_agreement(
            temp_paired_fastq_files=temp_paired_fastq_files,
            temp_output_dir=temp_output_dir,
            gzip_output=settings["gzip_output"],
        )
|
|
294
|
+
|
|
295
|
+
# st()
|
|
296
|
+
|
|
297
|
+
def test_paired_files_bootstrapped(temp_paired_fastq_files):
    """Sample a fraction of 1 from a pair of files (group_size=2) with replacement."""
    settings = {"fraction": 1, "gzip_output": False, "replacement": True}

    with tempfile.TemporaryDirectory() as temp_output_dir:
        fastQpick(
            input_files=temp_paired_fastq_files,
            seed=42,
            output_dir=temp_output_dir,
            group_size=2,
            overwrite=True,
            **settings,
        )

        # Each output file must pass the single-file checks on its own ...
        for fastq_file in temp_paired_fastq_files:
            run_all_single_file_tests(
                temp_output_dir=temp_output_dir,
                temp_fastq_file=fastq_file,
                **settings,
            )

        # ... and the two outputs must contain the same reads in the same order.
        check_pairwise_agreement(
            temp_paired_fastq_files=temp_paired_fastq_files,
            temp_output_dir=temp_output_dir,
            gzip_output=settings["gzip_output"],
        )
|
|
319
|
+
|
|
320
|
+
# st()
|