Demultiplex 1.2.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ from importlib.metadata import PackageNotFoundError, metadata
2
+ from re import split
3
+ from typing import Callable
4
+
5
+ from .demultiplex import Extractor, count, demultiplex
6
+
7
+
8
def _extract(key: str, delim: str = r'[^\s\S]', index: int = 0) -> str:
    """Look up one field of this package's installed metadata.

    :arg str key: Metadata field name, e.g. ``'Version'``.
    :arg str delim: Regular expression used to split the field value. The
        default pattern cannot match any character, so the value stays whole.
    :arg int index: Which piece of the split value to return.

    :returns str: The selected piece, or ``'<NO DATA>'`` when the package
        metadata cannot be found.
    """
    try:
        fields = metadata(__package__)
    except PackageNotFoundError:
        return '<NO DATA>'

    # A missing field is treated as an empty value.
    pieces = split(delim, fields.get(key, ''))
    return pieces[index]
14
+
15
+
16
def doc_split(func: Callable) -> str:
    """Return the first paragraph of a callable's docstring.

    :arg Callable func: Object whose ``__doc__`` is inspected.

    :returns str: The text up to the first blank line, or an empty string
        when the object has no docstring.
    """
    # ``__doc__`` is None when no docstring was written or when docstrings
    # are stripped (``python -OO``); fall back to '' instead of raising.
    return (func.__doc__ or '').split('\n\n')[0]
18
+
19
+
20
# Package facts used to build the version banner and the copyright line.
_project = _extract('Name')
_version = _extract('Version')
_year = '2013-2026'
# The author may only be present inside ``Author-email`` in the form
# '"Name" <address>'; take that value apart instead of printing it whole.
_contact = _extract('Author-email')
_author = _extract('Author') or _contact.partition('<')[0].strip(' "')
_email = _contact.rpartition('<')[2].rstrip('>')
_description = _extract('Summary')
_copyright = f'Copyright (c) {_year} by {_author} <{_email}>'
# ``Project-URL`` has the form 'label, url'; index -1 keeps only the URL and
# is also valid when the value contains no label.
_url = _extract('Project-URL', r',\s*', -1)
_info = f'{_project} version {_version}\n\n{_copyright}\nHomepage: {_url}'
demultiplex/cli.py ADDED
@@ -0,0 +1,150 @@
1
+ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
2
+ from sys import stdin
3
+
4
+ from fastools import Peeker
5
+
6
+ from . import _copyright, _description, _info, doc_split
7
+ from .demultiplex import (
8
+ _get_barcode, _type_handler, Extractor, count, demultiplex, match)
9
+
10
+
11
def _file_type(*args, **kwargs):
    """Argparse FileType replacement.

    Build a converter for ``add_argument(type=...)`` that opens a file with
    the opener registered in ``_type_handler`` for its extension. As with
    ``argparse.FileType``, the name ``'-'`` selects a standard stream instead
    of a file on disk; the text stream is returned whatever the mode flags.

    :arg args: Positional arguments for the opener; the first one is the mode.
    :arg kwargs: Keyword arguments for the opener.

    :returns function: Converter from a file name to an open handle.
    """
    mode = kwargs.get('mode', args[0] if args else 'r')

    def _open(name):
        if name == '-':
            # Without this branch a '-' default would create a file that is
            # literally named '-'.
            if any(flag in mode for flag in 'wax'):
                from sys import stdout
                return stdout
            # The module-level ``stdin`` is looked up at call time, so the
            # wrapper installed by ``main`` is the one that gets returned.
            return stdin
        return _type_handler[name.split('.')[-1]](name, *args, **kwargs)

    return _open
17
+
18
+
19
def guess(
        input_handle, output_handle, in_read, fmt, start, end, sample_size,
        threshold, use_freq):
    """Retrieve the most frequent barcodes.

    The selected barcodes are written in sorted order, one per line, each
    preceded by a running number that starts at 1.

    :arg stream input_handle: Handle to an NGS data file.
    :arg stream output_handle: Handle the barcode list is written to.
    :arg bool in_read: Inspect the read instead of the header.
    :arg str fmt: Header format.
    :arg int start: Start of the barcode.
    :arg int end: End of the barcode.
    :arg int sample_size: Number of records to probe.
    :arg int threshold: Threshold for the selection method.
    :arg bool use_freq: Select on frequency instead of a fixed amount.
    """
    selected = count(
        input_handle, Extractor(input_handle, in_read, fmt, start, end),
        sample_size, threshold, use_freq)

    for number, barcode in enumerate(sorted(selected), start=1):
        output_handle.write('{} {}\n'.format(number, barcode))
28
+
29
+
30
def demux(
        input_handles, barcodes_handle, in_read, fmt, start, end, mismatch,
        use_edit, path='.'):
    """Demultiplex any number of files given a list of barcodes.

    The barcode extractor is configured from the first input file.

    :arg list input_handles: List of handles to NGS data files.
    :arg stream barcodes_handle: Handle to a file containing barcodes.
    :arg bool in_read: Inspect the read instead of the header.
    :arg str fmt: Header format.
    :arg int start: Start of the barcode.
    :arg int end: End of the barcode.
    :arg int mismatch: Number of allowed mismatches.
    :arg bool use_edit: Use Levenshtein distance.
    :arg str path: Output directory.
    """
    first_handle = input_handles[0]
    barcode_extractor = Extractor(first_handle, in_read, fmt, start, end)

    demultiplex(
        input_handles, barcodes_handle, barcode_extractor, mismatch,
        use_edit, path)
37
+
38
+
39
def bcmatch(
        input_handles, barcodes_handle, mismatch, use_edit, path='.',
        filter_multiple=False, directional=False):
    """Demultiplex one file given a list of barcode tuples.

    Command line front end that forwards all of its arguments to ``match``.

    :arg list input_handles: List of handles to NGS data files.
    :arg stream barcodes_handle: Handle to a file containing barcodes.
    :arg int mismatch: Number of allowed mismatches.
    :arg bool use_edit: Use Levenshtein distance.
    :arg str path: Output directory.
    :arg bool filter_multiple: Write multiple matches to separate files.
    :arg bool directional: Directional input data.
    """
    match(
        input_handles=input_handles, barcodes_handle=barcodes_handle,
        mismatch=mismatch, use_edit=use_edit, path=path,
        filter_multiple=filter_multiple, directional=directional)
46
+
47
+
48
def _arg_parser() -> ArgumentParser:
    """Command line argument parsing.

    :returns ArgumentParser: Parser with the ``guess``, ``demux`` and
        ``match`` subcommands; each subcommand stores its handler function
        under ``func``.
    """
    # Barcode extraction options, shared by ``guess`` and ``demux``.
    common_parser = ArgumentParser(add_help=False)
    common_parser.add_argument(
        '-r', dest='in_read', action='store_true',
        help='extract the barcodes from the read')
    common_parser.add_argument(
        '--format', dest='fmt', default=None, choices=_get_barcode.keys(),
        help='provide the header format')
    common_parser.add_argument(
        '-s', dest='start', type=int, default=None,
        help='start of the selection')
    common_parser.add_argument(
        '-e', dest='end', type=int, default=None, help='end of the selection')

    # Matching and output options, shared by ``demux`` and ``match``.
    common_options_parser = ArgumentParser(add_help=False)
    common_options_parser.add_argument(
        '-m', dest='mismatch', type=int, default=1,
        help='number of mismatches')
    common_options_parser.add_argument(
        '-d', dest='use_edit', action='store_true',
        help='use Levenshtein distance')
    common_options_parser.add_argument(
        '-p', dest='path', type=str, default='.', help='output directory')

    # Positional barcode and input files, shared by ``demux`` and ``match``.
    input_parser = ArgumentParser(add_help=False)
    input_parser.add_argument(
        'barcodes_handle', metavar='BARCODES', type=_file_type('rt'),
        help='barcodes file')
    input_parser.add_argument(
        'input_handles', metavar='INPUT', nargs='+', type=_file_type('rt'),
        help='input files')

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter, description=_description,
        epilog=_copyright)
    parser.add_argument('-v', action='version', version=_info)
    subparsers = parser.add_subparsers(dest='subcommand')
    subparsers.required = True

    subparser = subparsers.add_parser(
        'guess', formatter_class=ArgumentDefaultsHelpFormatter,
        parents=[common_parser], description=doc_split(guess))
    subparser.add_argument(
        'input_handle', metavar='INPUT', type=_file_type('rt'),
        help='input file')
    subparser.add_argument(
        '-o', dest='output_handle', metavar='OUTPUT', type=_file_type('wt'),
        default='-', help='output file')
    subparser.add_argument(
        '-n', dest='sample_size', type=int, default=1000000,
        help='sample size')
    subparser.add_argument(
        '-f', dest='use_freq', action='store_true',
        help='select on frequency instead of a fixed amount')
    subparser.add_argument(
        '-t', dest='threshold', type=int, default=12,
        help='threshold for the selection method')
    subparser.set_defaults(func=guess)

    subparser = subparsers.add_parser(
        'demux', formatter_class=ArgumentDefaultsHelpFormatter,
        parents=[common_parser, common_options_parser, input_parser],
        description=doc_split(demux))
    subparser.set_defaults(func=demux)

    subparser = subparsers.add_parser(
        'match', formatter_class=ArgumentDefaultsHelpFormatter,
        parents=[common_options_parser, input_parser],
        description=doc_split(bcmatch))
    subparser.add_argument(
        '-f', dest='filter_multiple', default=False, action='store_true',
        help='write multiple matches to separate files')
    subparser.add_argument(
        '-D', dest='directional', default=False, action='store_true',
        help='directional input data')
    subparser.set_defaults(func=bcmatch)

    return parser
127
+
128
+
129
def main():
    """Main entry point.

    Parse the command line and call the handler of the chosen subcommand
    with the parsed options as keyword arguments. ``IOError`` raised while
    parsing, and ``ValueError`` or ``OSError`` raised by the handler, are
    reported through the parser's error mechanism.
    """
    global stdin

    parser = _arg_parser()

    # Replace the module-level ``stdin`` with a ``Peeker`` wrapper.
    stdin = Peeker(stdin)

    try:
        args = parser.parse_args()
    except IOError as error:
        parser.error(error)

    # ``func`` and ``subcommand`` steer the dispatch; they are not handler
    # parameters.
    call_arguments = {
        key: value for key, value in vars(args).items()
        if key not in ('func', 'subcommand')}

    try:
        args.func(**call_arguments)
    except (ValueError, OSError) as error:
        parser.error(error)


if __name__ == '__main__':
    main()
@@ -0,0 +1,222 @@
1
+ from bz2 import open as bz2_open
2
+ from collections import defaultdict
3
+ from gzip import open as gzip_open
4
+ from os import mkdir
5
+ from os.path import basename, exists
6
+
7
+ from Bio import SeqIO
8
+ from Bio.Seq import reverse_complement
9
+ from dict_trie import Trie
10
+ from fastools import guess_file_format, guess_header_format
11
+ from jit_open import Handle, Queue
12
+
13
+ from .match import multi_align
14
+
15
+
16
# Barcode extraction functions, keyed by header format name.
_get_barcode = {
    # Text between the first '#' and the following '/' of the record id.
    'normal': lambda rec: rec.id.split('#')[1].partition('/')[0],
    # Last ':'-separated field of the description.
    'x': lambda rec: rec.description.rpartition(':')[2],
    # Last ':'-separated field of the first space-delimited word.
    'umi': lambda rec: rec.description.partition(' ')[0].rpartition(':')[2],
    # No header format: use the sequence itself.
    'unknown': lambda rec: str(rec.seq),
}
21
+
22
# File openers keyed by file name extension; any extension that is not
# listed falls back to the builtin ``open``.
_type_handler = defaultdict(
    lambda: open,
    bz2=bz2_open,
    bzip2=bz2_open,
    gz=gzip_open,
    gzip=gzip_open)
27
+
28
+
29
def _name(handle):
    """Return the file name that belongs to an open handle.

    When ``handle.buffer`` has an ``_fp`` attribute the name is read from
    that object (presumably a compressed-file wrapper that keeps the real
    file there -- TODO confirm); otherwise ``handle.name`` is used.

    :arg stream handle: Open file handle with a ``buffer`` attribute.

    :returns str: Name of the underlying file.
    """
    raw = handle.buffer
    if not hasattr(raw, '_fp'):
        return handle.name
    return raw._fp.name
33
+
34
+
35
class Extractor(object):
    """Barcode extractor for sequence records."""

    def __init__(self, handle, in_read=False, fmt=None, start=None, end=None):
        """Configure a barcode extractor.

        :arg stream handle: Handle to an NGS data file.
        :arg bool in_read: Inspect the read instead of the header.
        :arg str fmt: Header format.
        :arg int start: Start of the barcode.
        :arg int end: End of the barcode.
        """
        # A non-zero start is lowered by one so that it can be used as a
        # slice index; None and 0 are kept as they are.
        self._start = start - 1 if start else start
        self._end = end

        # An explicit format wins; otherwise the read itself is used when
        # requested, and the header format is guessed from the file if not.
        if fmt:
            key = fmt
        elif in_read:
            key = 'unknown'
        else:
            key = guess_header_format(handle)

        self._get_barcode = _get_barcode[key]

    def get(self, record):
        """Extract the configured part of the barcode from a record.

        :arg record: Sequence record to inspect.

        :returns str: The selected slice of the barcode.
        """
        barcode = self._get_barcode(record)
        return barcode[self._start:self._end]
61
+
62
+
63
def count(handle, extractor, sample_size, threshold, use_freq=False):
    """Get the most frequent barcodes from an NGS data file.

    :arg stream handle: Handle to an NGS data file.
    :arg Extractor extractor: A barcode extractor.
    :arg int sample_size: Number of records to probe.
    :arg int threshold: Threshold for the selection method.
    :arg bool use_freq: Select frequent barcodes instead of a fixed amount.

    :returns list: A list of barcodes. With `use_freq`, every barcode seen at
        least `threshold` times; otherwise the `threshold` most frequent
        barcodes, most frequent first.
    """
    barcodes = defaultdict(int)

    for i, record in enumerate(SeqIO.parse(handle, guess_file_format(handle))):
        # ``i`` is zero-based: stop as soon as ``sample_size`` records have
        # been counted, not one record later.
        if i >= sample_size:
            break
        barcodes[extractor.get(record)] += 1

    if use_freq:
        # Return a real list, as documented, rather than a lazy iterator.
        return [
            barcode for barcode, seen in barcodes.items()
            if seen >= threshold]
    return sorted(barcodes, key=barcodes.get, reverse=True)[:threshold]
84
+
85
+
86
def _open_files(path, filenames, barcode, queue):
    """For a list of input files, open the corresponding output files.

    Each output file is named after its input file, with ``_<barcode>``
    inserted before the first dot. Input names without an extension are
    accepted; their output name simply ends in ``_<barcode>``.

    :arg str path: Output directory; created when it does not exist.
    :arg list filenames: List of input filenames.
    :arg str barcode: Name of the barcode.
    :arg Queue queue: Queue for open files.

    :returns list: List of handles of output files.
    """
    if not exists(path):
        mkdir(path)

    handles = []

    for filename in filenames:
        # ``partition`` never fails: for a name without a dot, both ``dot``
        # and ``ext`` are empty and the default opener is selected.
        base, dot, ext = basename(filename).partition('.')
        handles.append(
            Handle('{}/{}_{}{}{}'.format(path, base, barcode, dot, ext), queue,
                   f_open=_type_handler[ext.split('.')[-1]]))

    return handles
108
+
109
+
110
def _write(handles, records, file_format):
    """Write each record to the handle at the same list position.

    :arg list handles: Output handles, one per input file.
    :arg list records: Records to write.
    :arg str file_format: Format name passed on to ``SeqIO.write``.
    """
    for position, record in enumerate(records):
        target = handles[position]
        SeqIO.write(record, target, file_format)
113
+
114
+
115
def demultiplex(
        input_handles, barcodes_handle, extractor, mismatch, use_edit,
        path='.'):
    """Demultiplex a list of NGS data files.

    The input files are read in lockstep. The barcode is taken from the
    record of the first file and decides where the records of all files are
    written; records without a matching barcode go to the ``UNKNOWN`` files.

    :arg list input_handles: List of handles to NGS data files.
    :arg stream barcodes_handle: Handle to a file containing barcodes.
    :arg Extractor extractor: A barcode extractor.
    :arg int mismatch: Number of allowed mismatches.
    :arg bool use_edit: Use Levenshtein distance instead of Hamming distance.
    :arg str path: Output directory.

    :raises ValueError: If a line of the barcodes file does not consist of
        exactly two whitespace-separated fields.
    """
    filenames = [_name(handle) for handle in input_handles]
    queue = Queue()
    default_handles = _open_files(path, filenames, 'UNKNOWN', queue)

    # Map every barcode to the output handles of its sample.
    barcodes = {}
    for line in barcodes_handle.readlines():
        try:
            name, barcode = line.split()
        except ValueError:
            raise ValueError('invalid barcodes file format')
        barcodes[barcode] = _open_files(path, filenames, name, queue)

    trie = Trie(barcodes.keys())
    distance_function = trie.best_levenshtein if use_edit else trie.best_hamming

    file_format = guess_file_format(input_handles[0])
    readers = [SeqIO.parse(handle, file_format) for handle in input_handles]

    while True:
        # A StopIteration raised by ``next`` inside ``map`` ends the list
        # early, so an exhausted first reader gives an empty list here.
        records = list(map(next, readers))
        if not records:
            break

        barcode = distance_function(extractor.get(records[0]), mismatch)
        targets = barcodes[barcode] if barcode else default_handles
        _write(targets, records, file_format)

    queue.flush()
160
+
161
+
162
def match(
        input_handles, barcodes_handle, mismatch, use_edit, path='.',
        filter_multiple=False, directional=False):
    """Demultiplex a list of NGS data files.

    Each line of the barcodes file holds a sample name followed by the
    barcodes of that sample. The sequence of the first file's record is
    aligned against every sample; with `directional` set, its reverse
    complement is tried as well. Records without a matching sample go to the
    ``UNKNOWN`` files.

    :arg list input_handles: List of handles to NGS data files.
    :arg stream barcodes_handle: Handle to a file containing barcodes.
    :arg int mismatch: Number of allowed mismatches.
    :arg bool use_edit: Use Levenshtein distance instead of Hamming distance.
    :arg str path: Output directory.
    :arg bool filter_multiple: Write multiple matches to separate files.
    :arg bool directional: Directional input data.

    :raises ValueError: If the barcodes file contains an empty line.
    """
    filenames = [_name(handle) for handle in input_handles]
    queue = Queue()
    default_handles = _open_files(path, filenames, 'UNKNOWN', queue)
    multiple_handles = _open_files(path, filenames, 'MULTIPLE', queue)

    # Without edit distance a much larger indel score is passed on.
    indel_score = 1 if use_edit else 1000

    # Pairs of (output handles, barcodes of the sample).
    barcodes = []
    for line in barcodes_handle.readlines():
        fields = line.split()
        try:
            name = fields.pop(0)
        except (IndexError, ValueError):
            raise ValueError('invalid barcodes file format')
        barcodes.append((_open_files(path, filenames, name, queue), fields))

    file_format = guess_file_format(input_handles[0])
    readers = [SeqIO.parse(handle, file_format) for handle in input_handles]

    while True:
        # A StopIteration raised by ``next`` inside ``map`` ends the list
        # early, so an exhausted first reader gives an empty list here.
        records = list(map(next, readers))
        if not records:
            break

        reference = str(records[0].seq)
        if directional:
            reference_rc = reverse_complement(reference)

        found_handles = []
        for handles, barcode in barcodes:
            if multi_align(reference, barcode, mismatch, indel_score) or (
                    directional and multi_align(
                        reference_rc, barcode, mismatch, indel_score)):
                found_handles.append(handles)

        if not found_handles:
            _write(default_handles, records, file_format)
        elif filter_multiple and len(found_handles) > 1:
            _write(multiple_handles, records, file_format)
        else:
            for handles in found_handles:
                _write(handles, records, file_format)

    queue.flush()
demultiplex/match.py ADDED
@@ -0,0 +1,24 @@
1
+ from tssv import align
2
+
3
+
4
def multi_align(reference, barcodes, distance, indel_score):
    """Align multiple barcodes in order to a reference.

    :arg str reference: Sequence the barcodes are aligned to.
    :arg list barcodes: Barcodes, in the order they have to occur.
    :arg int distance: Maximum alignment distance allowed per barcode.
    :arg int indel_score: Indel score passed on to the aligner.

    :returns bool: True if all barcodes align in order, False otherwise.
    """
    remaining = reference

    for barcode in barcodes:
        alignment = align(remaining, barcode, indel_score)

        if alignment['distance'] > distance:
            return False

        # Continue from the reported position so that the next barcode is
        # searched for in the rest of the reference only.
        remaining = remaining[alignment['position']:]

    return True
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: Demultiplex
3
+ Version: 1.2.3
4
+ Summary: Demultiplex any number of FASTA or FASTQ files based on a list of barcodes.
5
+ Project-URL: homepage, https://github.com/jfjlaros/demultiplex
6
+ Author-email: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl>
7
+ License-Expression: MIT
8
+ License-File: LICENSE.md
9
+ Keywords: FASTA,FASTQ,barcode,bioinformatics,demultiplex
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Scientific/Engineering
15
+ Requires-Dist: biopython
16
+ Requires-Dist: dict-trie
17
+ Requires-Dist: fastools
18
+ Requires-Dist: jit-open
19
+ Requires-Dist: tssv
20
+ Description-Content-Type: text/x-rst
21
+
22
+ Demultiplex: FASTA/FASTQ demultiplexer
23
+ ======================================
24
+
25
+ .. image:: https://img.shields.io/github/last-commit/jfjlaros/demultiplex.svg
26
+ :target: https://github.com/jfjlaros/demultiplex/graphs/commit-activity
27
+ .. image:: https://github.com/jfjlaros/demultiplex/actions/workflows/test.yml/badge.svg
28
+ :target: https://github.com/jfjlaros/demultiplex/actions/workflows/test.yml
29
+ .. image:: https://readthedocs.org/projects/demultiplex/badge/?version=latest
30
+ :target: https://demultiplex.readthedocs.io/en/latest
31
+ .. image:: https://img.shields.io/github/release-date/jfjlaros/demultiplex.svg
32
+ :target: https://github.com/jfjlaros/demultiplex/releases
33
+ .. image:: https://img.shields.io/github/release/jfjlaros/demultiplex.svg
34
+ :target: https://github.com/jfjlaros/demultiplex/releases
35
+ .. image:: https://img.shields.io/pypi/v/demultiplex.svg
36
+ :target: https://pypi.org/project/demultiplex/
37
+ .. image:: https://img.shields.io/github/languages/code-size/jfjlaros/demultiplex.svg
38
+ :target: https://github.com/jfjlaros/demultiplex
39
+ .. image:: https://img.shields.io/github/languages/count/jfjlaros/demultiplex.svg
40
+ :target: https://github.com/jfjlaros/demultiplex
41
+ .. image:: https://img.shields.io/github/languages/top/jfjlaros/demultiplex.svg
42
+ :target: https://github.com/jfjlaros/demultiplex
43
+ .. image:: https://img.shields.io/github/license/jfjlaros/demultiplex.svg
44
+ :target: https://raw.githubusercontent.com/jfjlaros/demultiplex/master/LICENSE.md
45
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.8362959.svg
46
+ :target: https://zenodo.org/record/8362959
47
+
48
+ ----
49
+
50
+ Versatile NGS demultiplexer with the following features:
51
+
52
+ - Support for FASTA and FASTQ files.
53
+ - Support for gzip and bzip2 compressed files.
54
+ - Support for multiple reads per fragment, e.g., paired-end.
55
+ - Handles barcodes in the header and in the reads.
56
+ - Handles barcodes at *unknown* locations in reads (e.g., PacBio or Nanopore
57
+ barcodes).
58
+ - Support for selection of part of a barcode.
59
+ - Allows for mismatches, insertions and deletions.
60
+ - Barcode guessing by frequency or fixed amount.
61
+ - Handles large numbers (over one million) of barcodes.
62
+
63
+ Please see ReadTheDocs_ for the latest documentation.
64
+
65
+
66
+ .. _ReadTheDocs: https://demultiplex.readthedocs.io/en/latest/index.html
@@ -0,0 +1,9 @@
1
+ demultiplex/__init__.py,sha256=ibYYpUFoquxlchobG9SBPEjvHLHkOn8qPP2n_0BT7FQ,827
2
+ demultiplex/cli.py,sha256=E9Hdz-JuxVP8brMHMMd1dQ-OEj2q-xUZRMWkRrtgCzU,5365
3
+ demultiplex/demultiplex.py,sha256=SsyYjNhYrGjHfYEO_Fc4nxUMuWqrXq2qkCGmUSe2fQw,7352
4
+ demultiplex/match.py,sha256=GwvngjAotvkm5tSDgTGVgfd9Y7ZDsEKMVThchGuS6Cg,584
5
+ demultiplex-1.2.3.dist-info/METADATA,sha256=UBX4zlH2flCZNjCWWmUvrDCRDxiHrQ--pI1u9TAmiHw,3032
6
+ demultiplex-1.2.3.dist-info/WHEEL,sha256=VX-VJ7c6dw9Ge3EqJIbA6W3pOUbz24SnnGGFNr55jY4,105
7
+ demultiplex-1.2.3.dist-info/entry_points.txt,sha256=kg7ARrQaxKCFhUPPE-_EDOje-y3iXuZAtSuW_MptWUo,53
8
+ demultiplex-1.2.3.dist-info/licenses/LICENSE.md,sha256=kSdtuXPyVgLRqkNJH1nLyEy4jm8VHh0MyCp1VWPOAZU,1023
9
+ demultiplex-1.2.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ demultiplex = demultiplex.cli:main
@@ -0,0 +1,17 @@
1
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
2
+ this software and associated documentation files (the "Software"), to deal in
3
+ the Software without restriction, including without limitation the rights to
4
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
5
+ of the Software, and to permit persons to whom the Software is furnished to do
6
+ so, subject to the following conditions:
7
+
8
+ The above copyright notice and this permission notice shall be included in all
9
+ copies or substantial portions of the Software.
10
+
11
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17
+ SOFTWARE.