phantombuster 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. phantombuster-0.12.0/PKG-INFO +24 -0
  2. phantombuster-0.12.0/README.md +0 -0
  3. phantombuster-0.12.0/build.py +39 -0
  4. phantombuster-0.12.0/pyproject.toml +50 -0
  5. phantombuster-0.12.0/setup.py +47 -0
  6. phantombuster-0.12.0/src/phantombuster/__init__.py +3 -0
  7. phantombuster-0.12.0/src/phantombuster/bamindexer.py +101 -0
  8. phantombuster-0.12.0/src/phantombuster/cli.py +186 -0
  9. phantombuster-0.12.0/src/phantombuster/config_files.py +179 -0
  10. phantombuster-0.12.0/src/phantombuster/core.py +122 -0
  11. phantombuster-0.12.0/src/phantombuster/error_corrector.py +347 -0
  12. phantombuster-0.12.0/src/phantombuster/handler.py +82 -0
  13. phantombuster-0.12.0/src/phantombuster/io_.py +242 -0
  14. phantombuster-0.12.0/src/phantombuster/merge_cython.pyx +247 -0
  15. phantombuster-0.12.0/src/phantombuster/plumbing.py +821 -0
  16. phantombuster-0.12.0/src/phantombuster/porcelain.py +324 -0
  17. phantombuster-0.12.0/src/phantombuster/project.py +109 -0
  18. phantombuster-0.12.0/src/phantombuster/remoter/__init__.py +6 -0
  19. phantombuster-0.12.0/src/phantombuster/remoter/async_.py +21 -0
  20. phantombuster-0.12.0/src/phantombuster/remoter/cli.py +32 -0
  21. phantombuster-0.12.0/src/phantombuster/remoter/globaladdress.py +2 -0
  22. phantombuster-0.12.0/src/phantombuster/remoter/lock.py +73 -0
  23. phantombuster-0.12.0/src/phantombuster/remoter/logging_.py +91 -0
  24. phantombuster-0.12.0/src/phantombuster/remoter/messages.py +336 -0
  25. phantombuster-0.12.0/src/phantombuster/remoter/persisters.py +255 -0
  26. phantombuster-0.12.0/src/phantombuster/remoter/scheduler.py +426 -0
  27. phantombuster-0.12.0/src/phantombuster/remoter/serialization.py +87 -0
  28. phantombuster-0.12.0/src/phantombuster/remoter/server.py +442 -0
  29. phantombuster-0.12.0/src/phantombuster/remoter/socket_.py +76 -0
  30. phantombuster-0.12.0/src/phantombuster/remoter/store.py +223 -0
  31. phantombuster-0.12.0/src/phantombuster/remoter/task.py +187 -0
  32. phantombuster-0.12.0/src/phantombuster/remoter/worker.py +242 -0
  33. phantombuster-0.12.0/src/phantombuster/store.py +475 -0
  34. phantombuster-0.12.0/src/phantombuster/stores.py +51 -0
  35. phantombuster-0.12.0/src/phantombuster/vault.py +249 -0
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.1
2
+ Name: phantombuster
3
+ Version: 0.12.0
4
+ Summary:
5
+ Author: Simon Haendeler
6
+ Author-email: simon.emanuel.haendeler@univie.ac.at
7
+ Requires-Python: >=3.9,<3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: click (>=8.1.3,<9.0.0)
14
+ Requires-Dist: pandas (>=2.0,<3.0)
15
+ Requires-Dist: polars (>=0.19.12,<0.20.0)
16
+ Requires-Dist: pyarrow (>=15,<16)
17
+ Requires-Dist: pysam (>=0.20.0,<0.21.0)
18
+ Requires-Dist: regex (>=2022.10.31,<2023.0.0)
19
+ Requires-Dist: scipy (>=1.10.1,<2.0.0)
20
+ Requires-Dist: trio (>=0.22.0,<0.23.0)
21
+ Requires-Dist: zmq (>=0.0.0,<0.0.1)
22
+ Description-Content-Type: text/markdown
23
+
24
+
File without changes
@@ -0,0 +1,39 @@
1
+ import numpy as np
2
+ import pyarrow as pa
3
+
4
+ import os
5
+
6
+ from setuptools.extension import Extension
7
+ from Cython.Build import cythonize
8
+
9
def build(setup_kwargs):
    """
    Poetry build hook: compile the Cython extension and wire the numpy and
    pyarrow include/library paths into it.

    Mutates `setup_kwargs` in place by adding 'ext_modules'.
    """
    ext_modules = cythonize("src/phantombuster/merge_cython.pyx")

    # Make plain `libarrow.so`-style names available so the linker finds them.
    # (Fix: dropped a redundant second `import pyarrow`; it is already
    # imported at module level as `pa`.)
    pa.create_library_symlinks()

    for ext in ext_modules:
        # The Numpy C headers are currently required
        ext.include_dirs.append(np.get_include())
        ext.include_dirs.append(pa.get_include())
        ext.libraries.extend(pa.get_libraries())
        ext.library_dirs.extend(pa.get_library_dirs())

        if os.name == 'posix':
            # Arrow's C++ headers require C++17.
            ext.extra_compile_args.append('-std=c++17')
            # Locate the bundled Arrow shared libraries relative to the
            # installed extension at runtime.
            ext.runtime_library_dirs.append("$ORIGIN/../pyarrow")

        # Try uncommenting the following line on Linux
        # if you get weird linker errors or runtime crashes
        ext.define_macros.append(("_GLIBCXX_USE_CXX11_ABI", "0"))

    setup_kwargs.update(
        {
            # declare the extension so that setuptools will compile it
            "ext_modules": ext_modules,
        }
    )
@@ -0,0 +1,50 @@
1
# Project metadata and dependencies, managed by Poetry.
[tool.poetry]
name = "phantombuster"
version = "0.12.0"
description = ""
authors = ["Simon Haendeler <simon.emanuel.haendeler@univie.ac.at>"]
readme = "README.md"

# Runtime dependencies (caret = compatible-release constraints).
[tool.poetry.dependencies]
python = ">=3.9,<3.13"
pandas = "^2.0"
pysam = "^0.20.0"
regex = "^2022.10.31"
pyarrow = "^15"
click = "^8.1.3"
zmq = "^0.0.0"
trio = "^0.22.0"
scipy = "^1.10.1"
polars = "^0.19.12"

# Legacy dev-dependency section (pre Poetry 1.2 syntax); kept alongside the
# newer group-based section below.
[tool.poetry.dev-dependencies]
mypy = "^0.990"
pytest = "^7.2.0"
Cython = "^0.29.32"
cibuildwheel = "^2.12.0"

[tool.poetry.group.dev.dependencies]
auditwheel = "^5.3.0"

[tool.pytest.ini_options]
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
]

# Supplemental package source (local registry used for internal packages).
[[tool.poetry.source]]
name = "syntonym"
url = "http://localhost:8080/"
priority = 'supplemental'

# Build-time requirements for compiling the Cython extension via build.py.
[build-system]
requires = ["poetry-core", "cython", "numpy", "pyarrow==15.0.2", "setuptools"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.build]
script = "build.py"
generate-setup-file = true

# Console entry point: `phantombuster` runs the click CLI group.
[tool.poetry.scripts]
phantombuster = 'phantombuster.cli:phantombuster'

@@ -0,0 +1,47 @@
1
# -*- coding: utf-8 -*-
# Auto-generated setup script (poetry `generate-setup-file = true`); mirrors
# pyproject.toml and delegates extension building to build.py.
from setuptools import setup

package_dir = \
{'': 'src'}

packages = \
['phantombuster', 'phantombuster.remoter']

package_data = \
{'': ['*']}

install_requires = \
['click>=8.1.3,<9.0.0',
 'pandas>=2.0,<3.0',
 'polars>=0.19.12,<0.20.0',
 'pyarrow>=15,<16',
 'pysam>=0.20.0,<0.21.0',
 'regex>=2022.10.31,<2023.0.0',
 'scipy>=1.10.1,<2.0.0',
 'trio>=0.22.0,<0.23.0',
 'zmq>=0.0.0,<0.0.1']

entry_points = \
{'console_scripts': ['phantombuster = phantombuster.cli:phantombuster']}

setup_kwargs = {
    'name': 'phantombuster',
    'version': '0.12.0',
    'description': '',
    'long_description': '',
    'author': 'Simon Haendeler',
    'author_email': 'simon.emanuel.haendeler@univie.ac.at',
    'maintainer': 'None',
    'maintainer_email': 'None',
    'url': 'None',
    'package_dir': package_dir,
    'packages': packages,
    'package_data': package_data,
    'install_requires': install_requires,
    'entry_points': entry_points,
    'python_requires': '>=3.9,<3.13',
}
# build.py's build() injects 'ext_modules' (the Cython extension) in place.
from build import *
build(setup_kwargs)

setup(**setup_kwargs)
@@ -0,0 +1,3 @@
1
import importlib.metadata

# Single-source the package version from the installed distribution metadata.
__version__ = importlib.metadata.version("phantombuster")
@@ -0,0 +1,101 @@
1
+ import zlib
2
+ import re
3
+ import numpy as np
4
+
5
+
6
class EndOfFile(Exception):
    """Raised when the underlying file has no more data to read."""
    pass
8
+
9
def read_header(f):
    """Read one BGZF block header from file object `f`.

    Returns:
        (xlen, bsize): length of the gzip extra field and the total
        compressed block size taken from the BC subfield.

    Raises:
        EndOfFile: if the file is exhausted.
    """
    # Fixed 12-byte gzip member header; an empty read signals EOF.
    header = f.read(12)
    if header == b"":
        raise EndOfFile()
    xlen = parse_xlen(header)

    # XLEN bytes of gzip "extra" subfields follow the fixed header.
    subfields = f.read(xlen)
    if subfields == b"":
        raise EndOfFile()

    bsize = parse_bsize(subfields, xlen)
    return xlen, bsize
21
+
22
def read_block(f, bsize, xlen):
    """Read and inflate the payload of the current BGZF block.

    Assumes read_header() was already called, so the file position sits just
    past the 12-byte header and the XLEN extra bytes.
    """
    payload_length = bsize - 12 - xlen
    compressed = f.read(payload_length)
    # wbits=-15: raw DEFLATE stream without a zlib/gzip wrapper.
    return zlib.decompress(compressed, -15)
26
+
27
def parse_xlen(header):
    """Return XLEN: the little-endian u16 stored in the last two bytes of
    the 12-byte gzip member header."""
    low, high = header[-2], header[-1]
    return low | (high << 8)
29
+
30
def parse_bsize(subfields, xlen):
    """Scan the gzip extra subfields for the BGZF 'BC' entry and return the
    total compressed block size.

    BSIZE is stored as (block size - 1), so 1 is added back before returning.
    """
    pos = 0
    while pos <= xlen:
        if subfields[pos:pos + 2] == b"BC":
            # BC payload: u16 little-endian at offset +4 within the subfield.
            return int.from_bytes(subfields[pos + 4:pos + 6], "little") + 1
        # Skip this subfield: 4 header bytes (SI1 SI2 SLEN) + SLEN data bytes.
        pos += int.from_bytes(subfields[pos + 2:pos + 4], "little") + 4
    raise Exception("Could not find BSIZE")
43
+
44
def parse_read_start(data):
    """Find the byte offset of the first complete BAM record in an
    uncompressed BGZF block.

    Uses consecutive runs of eight 0xFF bytes as anchors: two such runs
    exactly 20 bytes apart mark a record whose start lies 4 bytes before
    the first run. Gives up after inspecting 10 anchor candidates.
    """
    if data == b"":
        raise EndOfFile()

    anchor = None
    candidates = 0
    for match in re.finditer(8 * b"\xff", data):
        start = match.start()
        # A truthy prior anchor exactly 20 bytes back (and >= 4 from the
        # block start) pins the record start.
        if anchor and start - anchor == 20 and anchor - 4 >= 0:
            return anchor - 4
        anchor = start
        candidates += 1
        if candidates == 10:
            raise Exception("Could not find the beginning of the read, either read contains FF bytes or assumptions are violated")
    raise Exception("Could not find the beginning of the read, assumptions are violated")
63
+
64
def index(f, every):
    """Build a sparse index of virtual offsets over a BGZF file.

    Walks every block header; roughly one block in `every` (plus the first
    two blocks) is decompressed to locate a read boundary inside it.

    Args:
        f: seekable binary file object positioned anywhere (seeks itself).
        every: sampling interval in blocks.

    Returns:
        List of virtual offsets (compressed offset << 16 | in-block offset),
        shifted by one entry and terminated with np.inf as a sentinel.
    """
    coffset = 0          # compressed byte offset of the current block
    index = []
    global_i = 0         # total blocks seen
    i = 0                # counter used for the sampling interval
    first_block = True

    while True:
        i += 1
        global_i += 1
        f.seek(coffset)
        try:
            xlen, bsize = read_header(f)
        except EndOfFile:
            break

        # Sample this block, always including the first two blocks.
        if i % every == 0 or global_i <= 2:
            block_data = read_block(f, bsize, xlen)
            if first_block:
                # The very first block starts with a record by construction.
                uoffset = 0
            else:
                try:
                    uoffset = parse_read_start(block_data)
                except EndOfFile:
                    break
            # BAM virtual offset: compressed offset in the high 48 bits,
            # uncompressed in-block offset in the low 16 bits.
            idx = (coffset << 16) | uoffset
            if idx >= 0:
                index.append(idx)

        coffset += bsize
        first_block = False

    # Drop the leading entry and close with +inf as an upper bound.
    # NOTE(review): np.inf is a float among ints — confirm consumers of this
    # index handle the mixed types.
    index = index[1:] + [np.inf]

    return index
100
+
101
+
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env python
2
+
3
+ import logging
4
+ import logging.config
5
+ import os
6
+ import os.path
7
+ import sys
8
+ import multiprocessing as mp
9
+ import glob
10
+
11
+ import ntpath
12
+ from dataclasses import dataclass
13
+ import json
14
+
15
+ from phantombuster import porcelain, plumbing, stores
16
+ from phantombuster import core
17
+ from phantombuster.stores import deduplicator_to_pyarrow_table
18
+ from phantombuster.remoter import Worker
19
+ from phantombuster.io_ import write_parquet
20
+ from phantombuster.project import Project
21
+ import click
22
+ from typing import Optional, List
23
+ import pyarrow.parquet
24
+ import pyarrow.csv
25
+
26
+ from pathlib import Path
27
+
28
def configure_logging(outputlog, verbose):
    """Configure the root logger via dictConfig.

    Always logs to stdout; optionally also to `outputlog`. `verbose` lowers
    the root level from INFO to DEBUG. The chatty 'remoter' logger is pinned
    at WARNING.
    """
    formatters = {
        'default': {
            'format': "%(asctime)s %(levelname)-8s %(name)-15s %(message)s",
            'datefmt': "%Y-%m-%d %H:%M:%S",
        }
    }
    handlers = {
        'console': {'class': 'logging.StreamHandler', 'formatter': 'default', 'stream': 'ext://sys.stdout'},
    }
    root = {'handlers': ['console'], 'level': 'DEBUG' if verbose else 'INFO'}
    if outputlog:
        handlers['file'] = {'class': 'logging.FileHandler', 'formatter': 'default', 'filename': outputlog}
        root['handlers'].append('file')

    logging_config = {
        'version': 1,
        'formatters': formatters,
        'handlers': handlers,
        'loggers': {'remoter': {'level': 'WARNING'}},
        'root': root,
    }
    logging.config.dictConfig(logging_config)
    logging.info('Logging configured')
45
+
46
+
47
def log_call(function, **kwargs):
    """Record a CLI invocation and its keyword arguments in the log."""
    message = f"PhantomBuster {function} was called with the following arguments: {kwargs}"
    logging.info(message)
50
+
51
+
52
@click.group()
@click.version_option(package_name='phantombuster')
@click.option("--verbose/--silent", default=False, help="Enable verbose debugging")
@click.option("-o", "--outputlog", type=click.Path(), help="Output file for logs")
@click.option("--save-results/--no-save-results", type=bool, default=True, help="DEBUG OPTION set no-save-results to not save which stages were already done")
def phantombuster(verbose: bool, outputlog: str, save_results: bool) -> None:
    """Root command group: configures logging before any subcommand runs.

    Fix: removed leftover debug print() statements around the logging setup;
    the 'Logging configured' log line already reports success.
    """
    configure_logging(outputlog, verbose)
61
+
62
+
63
+ # -- Main Commands -- #
64
+
65
+
66
@phantombuster.command()
@click.argument("input", type=click.Path(exists=True))
@click.option("--outdir", required=True)
@click.option("--regex-file", required=True)
@click.option("--barcode-hierarchy-file", type=click.Path(exists=True), required=True)
@click.option("--debug/--production", default=False)
@click.option("--show-qc/--no-qc", default=False)
@click.option("--force/--no-force", default=False)
def demultiplex(input, regex_file, debug, outdir, show_qc, force, barcode_hierarchy_file):
    """Demultiplex reads from INPUT into `outdir` using the regex and
    barcode-hierarchy configuration files.

    Fixes: removed leftover debug print() statements, the unused `as e`
    binding, and the dead trailing `return`.
    """
    # NOTE(review): `force` is accepted but never forwarded to core — confirm
    # whether it should be passed through.
    log_call("demultiplex", input=input, regex_file=regex_file, debug=debug,
             outdir=outdir, show_qc=show_qc, barcode_hierarchy_file=barcode_hierarchy_file)
    project = Project(outdir)

    try:
        core.demultiplex(input, regex_file, barcode_hierarchy_file, project, debug=debug, show_qc=show_qc)
    except Exception:
        # Log the full traceback, then signal a clean CLI abort to click.
        logging.exception("Pipeline encountered an error. Aborting.")
        raise click.Abort()
87
+
88
+
89
@phantombuster.command()
@click.option("--outdir", required=True)
@click.option("--error-threshold", default=1)
@click.option("--barcode-hierarchy-file", required=True)
def error_correct(outdir, error_threshold, barcode_hierarchy_file):
    """Error-correct barcodes in `outdir` using the barcode hierarchy.

    Consistency fix: barcode_hierarchy_file was missing from the log_call,
    unlike every sibling command that logs all of its arguments.
    """
    log_call("error-correct", outdir=outdir, error_threshold=error_threshold, barcode_hierarchy_file=barcode_hierarchy_file)
    project = Project(outdir)
    core.error_correct(project, error_threshold, barcode_hierarchy_file)
97
+
98
+
99
@phantombuster.command()
@click.argument('hopping-barcodes', nargs=-1)
@click.option("--outdir", required=True)
@click.option("--threshold", default=0.05, type=float)
def hopping_removal(outdir, threshold, hopping_barcodes):
    """Remove index-hopping artifacts for the given barcode combinations."""
    log_call("hopping-removal", outdir=outdir, threshold=threshold, hopping_barcodes=hopping_barcodes)
    project = Project(outdir)
    # Each positional argument is a comma-separated list of barcode names.
    hopping_barcodes = [bc.split(',') for bc in hopping_barcodes]
    core.hopping_removal(project, hopping_barcodes, threshold)
108
+
109
@phantombuster.command()
@click.option("--outdir", required=True)
@click.option("--prefix")
@click.option("--threshold-file", required=True)
def threshold(outdir, prefix, threshold_file):
    """Apply read-count thresholds from `threshold_file`."""
    log_call("threshold", outdir=outdir, prefix=prefix, threshold_file=threshold_file)
    # NOTE(review): unlike sibling commands, this passes the raw outdir string
    # instead of a Project instance — confirm core.threshold expects a path.
    core.threshold(outdir, prefix, threshold_file)
116
+
117
+ # -- Helper Commands -- #
118
+
119
@phantombuster.command()
@click.argument("prefixes", nargs=-1)
@click.option("--outdir", required=True)
@click.option("--prefix")
@click.option("--barcode-hierarchy-file", type=click.Path(exists=True), required=True)
def merge(prefixes, outdir, prefix, barcode_hierarchy_file):
    """
    Merge multiple prefixes under one prefix
    """
    # Log the call of this function with all parameters to the logfile
    log_call("merge", prefixes=prefixes, outdir=outdir, prefix=prefix, barcode_hierarchy_file=barcode_hierarchy_file)

    # NOTE(review): PathsAndFiles is not imported or defined anywhere in this
    # module, so this command raises NameError when invoked. It looks like
    # stale code predating the Project class — confirm and port to Project.
    master_paths = PathsAndFiles(outdir, prefix, None)
    master_paths.create()

    try:
        barcode_hierarchy = plumbing.read_barcode_hierarchy_file(barcode_hierarchy_file)
    except Exception:
        raise Exception("Could not read barcode hierarchy file correctly")

    # Load the deduplication stage of every source prefix, combine them, and
    # save the result under the target prefix's deduplication stage.
    to_merge = [PathsAndFiles(outdir, prefix, None) for prefix in prefixes]

    results = [stores.load(('deduplication', True), paths.stage_path('deduplication')) for paths in to_merge]
    out = plumbing.combine(results, barcode_hierarchy)

    stores.save(out, master_paths.stage_path('deduplication'), id='deduplication')
145
+
146
+
147
+
148
@phantombuster.command()
@click.argument("parquetfile")
@click.argument("outfile", default=None, required=False)
def to_csv(parquetfile, outfile):
    """Convert a parquet file to CSV; default output swaps the suffix."""
    log_call("to_csv", sample=parquetfile, outdir=outfile)
    if outfile is None:
        outfile = parquetfile.replace(".parquet", ".csv")
    table = pyarrow.parquet.read_table(parquetfile)
    pyarrow.csv.write_csv(table, outfile)
157
+
158
+
159
@phantombuster.command()
@click.argument("csvfile")
@click.argument("outfile", default=None, required=False)
def to_parquet(csvfile, outfile):
    """Convert a CSV file to parquet; default output swaps the suffix."""
    log_call("to_parquet", csvfile=csvfile, outfile=outfile)
    table = pyarrow.csv.read_csv(csvfile)
    if outfile is None:
        outfile = csvfile.replace(".csv", ".parquet")
    write_parquet(table, outfile)
168
+
169
+
170
@phantombuster.command()
@click.option("--outdir", default=None, required=True)
@click.option("--name", default=None)
def worker(outdir, name):
    """Start a remoter Worker connected to the project's server socket."""
    # NOTE(review): these imports appear to exist for side effects so worker
    # tasks can resolve these modules — confirm they are still required.
    import phantombuster as phantombuster
    import phantombuster.plumbing

    project = Project(outdir)
    project.create()
    path = project._get_server_path()

    print(f"Connecting to {path}")
    worker = Worker(path, name=name)
    worker.start_async()
184
+
185
# Allow running this module directly as a script.
if __name__ == "__main__":
    phantombuster()
@@ -0,0 +1,179 @@
1
+ from dataclasses import dataclass
2
+ import logging
3
+ import pyarrow
4
+ import pyarrow.csv
5
+ import regex
6
+ from phantombuster.plumbing import calculate_threshold
7
+
8
# Tags accepted in the 'tag' column of a regex file.
VALID_TAGS = ['b2', 'query', 'bc', 'name', 'seq']
9
+
10
+
11
def read_file(name: str) -> pyarrow.Table:
    """Read a delimited text file into a pyarrow Table.

    Files whose name ends in 'tsv' are parsed tab-separated; everything else
    uses the default comma delimiter.
    """
    if name.endswith('tsv'):
        parse_options = pyarrow.csv.ParseOptions(delimiter="\t")
    else:
        parse_options = pyarrow.csv.ParseOptions()
    return pyarrow.csv.read_csv(name, parse_options=parse_options)
18
+
19
+
20
@dataclass
class InputFile:
    """A single sequencing input file with its group and prefix labels."""
    path: str
    group: str
    prefix: str

    def _to_json_e(self):
        """Return a JSON-encodable dict representation."""
        return {'path': self.path, 'group': self.group, 'prefix': self.prefix}

    @classmethod
    def _from_json_e(cls, d: dict):
        """Rebuild an InputFile from the dict produced by _to_json_e."""
        return InputFile(path=d['path'], group=d['group'], prefix=d['prefix'])
32
+
33
+
34
@dataclass
class InputGroup:
    """All input files belonging to one sequencing group."""
    # Files that share this group label.
    files: list[InputFile]
    group: str

    def _to_json_e(self) -> dict:
        """Return a JSON-encodable dict of this group and its files."""
        return {'files': [f._to_json_e() for f in self.files], 'group': self.group}

    @classmethod
    def _from_json_e(cls, d: dict) -> 'InputGroup':
        """Rebuild an InputGroup (and its InputFiles) from _to_json_e output."""
        ig = InputGroup(files=[InputFile._from_json_e(f) for f in d['files']], group=d['group'])
        return ig
46
+
47
+
48
class RegexDictionary:
    """Maps group names to {tag: pattern} dicts of demultiplexing regexes.

    The special group '*' holds patterns that apply to every group.
    """

    def __init__(self, d=None):
        # None default avoids the shared mutable-default-argument pitfall.
        if d is None:
            d = {}
        self._groups = d

    def __eq__(self, o):
        if isinstance(o, RegexDictionary):
            return self._groups == o._groups
        else:
            # BUG FIX: was `return NotImplemented()`, which *calls* the
            # sentinel and raises TypeError. Returning the sentinel itself
            # lets Python fall back to the reflected comparison.
            return NotImplemented

    def add_regex(self, tag, rex, prefix='', group='*'):
        """Register pattern `rex` under key `prefix+tag` for `group`."""
        d = self._groups.get(group, {})
        d[prefix + tag] = rex
        self._groups[group] = d

    def get_regexes_for_group(self, group):
        """Return compiled regexes for `group`, merged with '*' entries.

        Raises AssertionError when no pattern applies to the group.
        """
        d1 = self._groups.get(group, {})
        d2 = self._groups.get('*', {})
        # NOTE(review): with `d1 | d2` the wildcard ('*') entries override
        # group-specific ones on key collision — confirm this is intended.
        d = d1 | d2
        assert len(d) > 0
        return {key: regex.compile(re) for key, re in d.items()}

    def _to_json_e(self):
        """Return the JSON-encodable plain dict-of-dicts representation."""
        return self._groups

    @classmethod
    def _from_json_e(cls, d):
        """Rebuild a RegexDictionary from _to_json_e output."""
        r = RegexDictionary(d)
        return r
80
+
81
+
82
def read_regex_file(path):
    """Parse a regex table (CSV/TSV) into a RegexDictionary.

    Required columns: 'tag' (each value must be in VALID_TAGS) and 'regex'.
    Optional columns: 'group' (default '*' = all groups) and 'prefix'
    (default '').

    Raises:
        KeyError: if any tag is not in VALID_TAGS.

    Fix: removed the unused local `column_names`.
    """
    table = read_file(path)

    tags = table['tag'].to_pylist()

    invalid_tags = [tag for tag in tags if tag not in VALID_TAGS]
    if len(invalid_tags) > 0:
        raise KeyError(f'Invalid tags: {", ".join(invalid_tags)}')

    regexes = table['regex'].to_pylist()

    # Optional columns fall back to per-row defaults.
    if 'group' not in table.column_names:
        groups = ['*'] * len(tags)
    else:
        groups = table['group'].to_pylist()

    if 'prefix' not in table.column_names:
        prefixs = [''] * len(tags)
    else:
        prefixs = table['prefix'].to_pylist()

    regex_dict = RegexDictionary()

    for group, tag, re, prefix in zip(groups, tags, regexes, prefixs):
        regex_dict.add_regex(tag, re, prefix, group)
    return regex_dict
109
+
110
+
111
def read_barcode_hierarchy_file(name):
    """Read the barcode hierarchy table into a list of barcode dicts.

    Each row yields a dict with name/type/referencefile/threshold/min_length/
    max_length. For 'reference' barcodes the reference file is loaded into a
    {barcode: name} mapping and 'auto' thresholds are computed from it.

    Raises:
        Exception: on missing columns or unparsable length/threshold values.
    """
    table = read_file(name)
    column_names = ["barcode", "type", "referencefile", "threshold", "min_length", "max_length"]
    missing_columns = [col for col in column_names if col not in table.column_names]
    if len(missing_columns) > 0 :
        raise Exception(f"Barcode Hierarchy File Incorrect, missing columns: {missing_columns}") # TODO better error reporting

    def parse_length(value):
        # '-' means "no bound"; anything else must parse as an integer.
        if value == "-":
            return None
        else:
            try:
                return int(value)
            except Exception:
                raise Exception("Can not parse min or max length correctly in barcode hierarchy file, must be either '-' or an integer")

    def parse_type(value):
        # Anything other than the literal "reference" is treated as "random".
        if value == "reference":
            return "reference"
        else:
            return "random"

    barcodes = [{"name": name, "type": parse_type(type), "referencefile": reference, "threshold": threshold,
                 "min_length": parse_length(min_length), "max_length": parse_length(max_length)}
                for name, type, reference, threshold, min_length, max_length in zip(*[table[name].to_pylist() for name in column_names])]

    for bc in barcodes:
        if bc["type"] == "reference":
            # NOTE(review): `table` is re-bound here (shadowing the hierarchy
            # table) and `bc` is shadowed inside the dict comprehension —
            # works, but easy to trip over.
            table = read_file(bc["referencefile"])
            bc["reference"] = {bc: name for bc, name in zip([str(bc) for bc in table["barcode"].to_pylist()], [str(name) for name in table["name"].to_pylist()])}
            if bc["threshold"] == "auto":
                bc["threshold"] = calculate_threshold(bc["reference"])
            else:
                try:
                    bc["threshold"] = int(bc["threshold"])
                except Exception:
                    raise Exception("Can not properly read threshold column of barcode hierarchy file, needs to be 'auto' or an integer")
    return barcodes
149
+
150
+
151
def read_input_files_file(path):
    """Read the input-files table and bundle the files into InputGroups.

    Required column: 'file'. Optional: 'group' (defaults to the file name,
    i.e. every file forms its own group) and 'prefix' (defaults to '').

    Returns:
        List of InputGroup, one per distinct group value.
    """
    table = read_file(path)

    # NOTE(review): unused — presumably documents the expected columns.
    column_names = ['file', 'group', 'prefix']

    files = table['file'].to_pylist()

    # Without an explicit 'group' column the file name doubles as group name.
    if 'group' not in table.column_names:
        groups = table["file"].to_pylist()
    else:
        groups = table['group'].to_pylist()

    if 'prefix' not in table.column_names:
        prefixs = [''] * len(files)
    else:
        prefixs = table['prefix'].to_pylist()

    # Null cells become empty strings.
    input_files = [InputFile(file, group if group is not None else "", prefix if prefix is not None else "") for file, group, prefix in zip(files, groups, prefixs)]
    groups = {}
    for f in input_files:
        l = groups.get(f.group, [])
        l.append(f)
        groups[f.group] = l

    igs = []
    for group_name, files in groups.items():
        igs.append(InputGroup(files, group_name))

    return igs