phantombuster 0.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phantombuster-0.12.0/PKG-INFO +24 -0
- phantombuster-0.12.0/README.md +0 -0
- phantombuster-0.12.0/build.py +39 -0
- phantombuster-0.12.0/pyproject.toml +50 -0
- phantombuster-0.12.0/setup.py +47 -0
- phantombuster-0.12.0/src/phantombuster/__init__.py +3 -0
- phantombuster-0.12.0/src/phantombuster/bamindexer.py +101 -0
- phantombuster-0.12.0/src/phantombuster/cli.py +186 -0
- phantombuster-0.12.0/src/phantombuster/config_files.py +179 -0
- phantombuster-0.12.0/src/phantombuster/core.py +122 -0
- phantombuster-0.12.0/src/phantombuster/error_corrector.py +347 -0
- phantombuster-0.12.0/src/phantombuster/handler.py +82 -0
- phantombuster-0.12.0/src/phantombuster/io_.py +242 -0
- phantombuster-0.12.0/src/phantombuster/merge_cython.pyx +247 -0
- phantombuster-0.12.0/src/phantombuster/plumbing.py +821 -0
- phantombuster-0.12.0/src/phantombuster/porcelain.py +324 -0
- phantombuster-0.12.0/src/phantombuster/project.py +109 -0
- phantombuster-0.12.0/src/phantombuster/remoter/__init__.py +6 -0
- phantombuster-0.12.0/src/phantombuster/remoter/async_.py +21 -0
- phantombuster-0.12.0/src/phantombuster/remoter/cli.py +32 -0
- phantombuster-0.12.0/src/phantombuster/remoter/globaladdress.py +2 -0
- phantombuster-0.12.0/src/phantombuster/remoter/lock.py +73 -0
- phantombuster-0.12.0/src/phantombuster/remoter/logging_.py +91 -0
- phantombuster-0.12.0/src/phantombuster/remoter/messages.py +336 -0
- phantombuster-0.12.0/src/phantombuster/remoter/persisters.py +255 -0
- phantombuster-0.12.0/src/phantombuster/remoter/scheduler.py +426 -0
- phantombuster-0.12.0/src/phantombuster/remoter/serialization.py +87 -0
- phantombuster-0.12.0/src/phantombuster/remoter/server.py +442 -0
- phantombuster-0.12.0/src/phantombuster/remoter/socket_.py +76 -0
- phantombuster-0.12.0/src/phantombuster/remoter/store.py +223 -0
- phantombuster-0.12.0/src/phantombuster/remoter/task.py +187 -0
- phantombuster-0.12.0/src/phantombuster/remoter/worker.py +242 -0
- phantombuster-0.12.0/src/phantombuster/store.py +475 -0
- phantombuster-0.12.0/src/phantombuster/stores.py +51 -0
- phantombuster-0.12.0/src/phantombuster/vault.py +249 -0

phantombuster-0.12.0/PKG-INFO
@@ -0,0 +1,24 @@
+Metadata-Version: 2.1
+Name: phantombuster
+Version: 0.12.0
+Summary:
+Author: Simon Haendeler
+Author-email: simon.emanuel.haendeler@univie.ac.at
+Requires-Python: >=3.9,<3.13
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: click (>=8.1.3,<9.0.0)
+Requires-Dist: pandas (>=2.0,<3.0)
+Requires-Dist: polars (>=0.19.12,<0.20.0)
+Requires-Dist: pyarrow (>=15,<16)
+Requires-Dist: pysam (>=0.20.0,<0.21.0)
+Requires-Dist: regex (>=2022.10.31,<2023.0.0)
+Requires-Dist: scipy (>=1.10.1,<2.0.0)
+Requires-Dist: trio (>=0.22.0,<0.23.0)
+Requires-Dist: zmq (>=0.0.0,<0.0.1)
+Description-Content-Type: text/markdown
+
+

phantombuster-0.12.0/README.md
File without changes

phantombuster-0.12.0/build.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pyarrow as pa
+
+import os
+
+from setuptools.extension import Extension
+from Cython.Build import cythonize
+
+def build(setup_kwargs):
+    """
+    This is a callback for poetry used to hook in our extensions.
+    """
+    ext_modules = cythonize("src/phantombuster/merge_cython.pyx")
+
+    import pyarrow
+    pyarrow.create_library_symlinks()
+
+    for ext in ext_modules:
+        # The Numpy C headers are currently required
+        ext.include_dirs.append(np.get_include())
+        ext.include_dirs.append(pa.get_include())
+        ext.libraries.extend(pa.get_libraries())
+        ext.library_dirs.extend(pa.get_library_dirs())
+
+        if os.name == 'posix':
+            ext.extra_compile_args.append('-std=c++17')
+        ext.runtime_library_dirs.append("$ORIGIN/../pyarrow")
+
+        # Try uncommenting the following line on Linux
+        # if you get weird linker errors or runtime crashes
+        ext.define_macros.append(("_GLIBCXX_USE_CXX11_ABI", "0"))
+
+
+    setup_kwargs.update(
+        {
+            # declare the extension so that setuptools will compile it
+            "ext_modules": ext_modules,
+        }
+    )
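
The hook above is normally only called by Poetry at build time. As a rough sketch (assuming Cython, numpy and pyarrow are installed and the sources are checked out in the working directory), it can also be exercised directly to inspect the extensions it configures; `build` here refers to the build.py module shown above, not an installed package:

```python
# Hypothetical direct invocation of the Poetry build hook shown above.
from build import build

setup_kwargs = {}
build(setup_kwargs)
for ext in setup_kwargs["ext_modules"]:
    # each entry is a setuptools Extension with the numpy/pyarrow
    # include and library directories appended by the hook
    print(ext.name, ext.include_dirs)
```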

phantombuster-0.12.0/pyproject.toml
@@ -0,0 +1,50 @@
+[tool.poetry]
+name = "phantombuster"
+version = "0.12.0"
+description = ""
+authors = ["Simon Haendeler <simon.emanuel.haendeler@univie.ac.at>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.13"
+pandas = "^2.0"
+pysam = "^0.20.0"
+regex = "^2022.10.31"
+pyarrow = "^15"
+click = "^8.1.3"
+zmq = "^0.0.0"
+trio = "^0.22.0"
+scipy = "^1.10.1"
+polars = "^0.19.12"
+
+[tool.poetry.dev-dependencies]
+mypy = "^0.990"
+pytest = "^7.2.0"
+Cython = "^0.29.32"
+cibuildwheel = "^2.12.0"
+
+[tool.poetry.group.dev.dependencies]
+auditwheel = "^5.3.0"
+
+[tool.pytest.ini_options]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
+
+[[tool.poetry.source]]
+name = "syntonym"
+url = "http://localhost:8080/"
+priority = 'supplemental'
+
+[build-system]
+requires = ["poetry-core", "cython", "numpy", "pyarrow==15.0.2", "setuptools"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.build]
+script = "build.py"
+generate-setup-file = true
+
+[tool.poetry.scripts]
+phantombuster = 'phantombuster.cli:phantombuster'
+
+

phantombuster-0.12.0/setup.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from setuptools import setup
+
+package_dir = \
+{'': 'src'}
+
+packages = \
+['phantombuster', 'phantombuster.remoter']
+
+package_data = \
+{'': ['*']}
+
+install_requires = \
+['click>=8.1.3,<9.0.0',
+ 'pandas>=2.0,<3.0',
+ 'polars>=0.19.12,<0.20.0',
+ 'pyarrow>=15,<16',
+ 'pysam>=0.20.0,<0.21.0',
+ 'regex>=2022.10.31,<2023.0.0',
+ 'scipy>=1.10.1,<2.0.0',
+ 'trio>=0.22.0,<0.23.0',
+ 'zmq>=0.0.0,<0.0.1']
+
+entry_points = \
+{'console_scripts': ['phantombuster = phantombuster.cli:phantombuster']}
+
+setup_kwargs = {
+    'name': 'phantombuster',
+    'version': '0.12.0',
+    'description': '',
+    'long_description': '',
+    'author': 'Simon Haendeler',
+    'author_email': 'simon.emanuel.haendeler@univie.ac.at',
+    'maintainer': 'None',
+    'maintainer_email': 'None',
+    'url': 'None',
+    'package_dir': package_dir,
+    'packages': packages,
+    'package_data': package_data,
+    'install_requires': install_requires,
+    'entry_points': entry_points,
+    'python_requires': '>=3.9,<3.13',
+}
+from build import *
+build(setup_kwargs)
+
+setup(**setup_kwargs)

phantombuster-0.12.0/src/phantombuster/bamindexer.py
@@ -0,0 +1,101 @@
+import zlib
+import re
+import numpy as np
+
+
+class EndOfFile(Exception):
+    pass
+
+def read_header(f):
+    header = f.read(12)
+    if header == b"":
+        raise EndOfFile()
+    xlen = parse_xlen(header)
+
+    subfields = f.read(xlen)
+    if subfields == b"":
+        raise EndOfFile()
+
+    bsize = parse_bsize(subfields, xlen)
+    return xlen, bsize
+
+def read_block(f, bsize, xlen):
+    """Assumes you called read_header before"""
+    rest = f.read(bsize - 12 - xlen)
+    return zlib.decompress(rest, -15)
+
+def parse_xlen(header):
+    return int.from_bytes(header[-2:], "little")
+
+def parse_bsize(subfields, xlen):
+    offset = 0
+    block_size = None
+    while offset <= xlen:
+        if subfields[offset:offset+2] == b"BC":
+            block_size = int.from_bytes(subfields[offset+4:offset+6], "little") +1
+            break
+        else:
+            offset += int.from_bytes(subfields[offset+2:offset+4], "little") + 4
+    if block_size is None:
+        raise Exception("Could not find BSIZE")
+
+    return block_size
+
+def parse_read_start(data):
+    if data == b"":
+        raise EndOfFile()
+
+    found = None
+    i = 0
+    for m in re.finditer(8*b"\xff", data):
+        if found:
+            offset = m.start() - found
+            if offset == 20 and (found-4 >= 0):
+                return found-4
+            else:
+                found = m.start()
+        else:
+            found = m.start()
+        i += 1
+        if i == 10:
+            raise Exception("Could not find the beginning of the read, either read contains FF bytes or assumptions are violated")
+    raise Exception("Could not find the beginning of the read, assumptions are violated")
+
+def index(f, every):
+    coffset = 0
+    index = []
+    global_i = 0
+    i = 0
+    first_block = True
+
+    while True:
+        i += 1
+        global_i += 1
+        f.seek(coffset)
+        try:
+            xlen, bsize = read_header(f)
+        except EndOfFile:
+            break
+
+        #if True:
+        if i % every == 0 or global_i <= 2:
+            block_data = read_block(f, bsize, xlen)
+            if first_block:
+                uoffset = 0
+            else:
+                try:
+                    uoffset = parse_read_start(block_data)
+                except EndOfFile:
+                    break
+            idx = (coffset << 16) | uoffset
+            if idx >= 0:
+                index.append(idx)
+
+        coffset += bsize
+        first_block = False
+
+    index = index[1:] + [np.inf]
+
+    return index
+
+
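
The module above indexes a BAM file by walking its BGZF blocks directly and recording virtual offsets of the form `(compressed_offset << 16) | uncompressed_offset`. A minimal usage sketch, assuming `example.bam` is a BGZF-compressed BAM file (the path is a placeholder, not part of the package):

```python
# Usage sketch for phantombuster.bamindexer; "example.bam" is illustrative.
from phantombuster import bamindexer

with open("example.bam", "rb") as f:
    # sample roughly every 100th block; the returned list ends with np.inf
    # as a sentinel value
    offsets = bamindexer.index(f, every=100)

print(offsets[:5])
```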

phantombuster-0.12.0/src/phantombuster/cli.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+
+import logging
+import logging.config
+import os
+import os.path
+import sys
+import multiprocessing as mp
+import glob
+
+import ntpath
+from dataclasses import dataclass
+import json
+
+from phantombuster import porcelain, plumbing, stores
+from phantombuster import core
+from phantombuster.stores import deduplicator_to_pyarrow_table
+from phantombuster.remoter import Worker
+from phantombuster.io_ import write_parquet
+from phantombuster.project import Project
+import click
+from typing import Optional, List
+import pyarrow.parquet
+import pyarrow.csv
+
+from pathlib import Path
+
+def configure_logging(outputlog, verbose):
+    logging_config = {
+        'version': 1,
+        'formatters':{'default': {'format': "%(asctime)s %(levelname)-8s %(name)-15s %(message)s",
+                                  'datefmt': "%Y-%m-%d %H:%M:%S"}},
+        'handlers': {'console': {'class': 'logging.StreamHandler', 'formatter': 'default', 'stream': 'ext://sys.stdout'}},
+        'loggers': {'remoter': {'level': 'WARNING'}},
+        'root': {'handlers': ['console'], 'level': 'INFO'}
+    }
+    if outputlog:
+        logging_config['handlers']['file'] = {'class': 'logging.FileHandler', 'formatter': 'default', 'filename': outputlog}
+        logging_config['root']['handlers'].append('file')
+    if verbose:
+        logging_config['root']['level'] = 'DEBUG'
+
+    logging.config.dictConfig(logging_config)
+    logging.info('Logging configured')
+
+
+def log_call(function, **kwargs):
+    """Log a CLI call with all arguments"""
+    logging.info(f"PhantomBuster {function} was called with the following arguments: {kwargs}")
+
+
+@click.group()
+@click.version_option(package_name='phantombuster')
+@click.option("--verbose/--silent", default=False, help="Enable verbose debugging")
+@click.option("-o", "--outputlog", type=click.Path(), help="Output file for logs")
+@click.option("--save-results/--no-save-results", type=bool, default=True, help="DEBUG OPTION set no-save-results to not save which stages were already done")
+def phantombuster(verbose: bool, outputlog: str, save_results: bool) -> None:
+    print('configuring logging')
+    configure_logging(outputlog, verbose)
+    print('configured logging')
+
+
+# -- Main Commands -- #
+
+
+@phantombuster.command()
+@click.argument("input", type=click.Path(exists=True))
+@click.option("--outdir", required=True)
+@click.option("--regex-file", required=True)
+@click.option("--barcode-hierarchy-file", type=click.Path(exists=True), required=True)
+@click.option("--debug/--production", default=False)
+@click.option("--show-qc/--no-qc", default=False)
+@click.option("--force/--no-force", default=False)
+def demultiplex(input, regex_file, debug, outdir, show_qc, force, barcode_hierarchy_file):
+    print('start demultiplex CLI command')
+    log_call("demultiplex", input=input, regex_file=regex_file, debug=debug,
+             outdir=outdir, show_qc=show_qc, barcode_hierarchy_file=barcode_hierarchy_file)
+    project = Project(outdir)
+
+    print('logged call, on to work')
+    try:
+        core.demultiplex(input, regex_file, barcode_hierarchy_file, project, debug=debug, show_qc=show_qc)
+    except Exception as e:
+        logging.exception("Pipeline encountered an error. Aborting.")
+        raise click.Abort()
+    return
+
+
+@phantombuster.command()
+@click.option("--outdir", required=True)
+@click.option("--error-threshold", default=1)
+@click.option("--barcode-hierarchy-file", required=True)
+def error_correct(outdir, error_threshold, barcode_hierarchy_file):
+    log_call("error-correct", outdir=outdir, error_threshold=error_threshold)
+    project = Project(outdir)
+    core.error_correct(project, error_threshold, barcode_hierarchy_file)
+
+
+@phantombuster.command()
+@click.argument('hopping-barcodes', nargs=-1)
+@click.option("--outdir", required=True)
+@click.option("--threshold", default=0.05, type=float)
+def hopping_removal(outdir, threshold, hopping_barcodes):
+    log_call("hopping-removal", outdir=outdir, threshold=threshold, hopping_barcodes=hopping_barcodes)
+    project = Project(outdir)
+    hopping_barcodes = [bc.split(',') for bc in hopping_barcodes]
+    core.hopping_removal(project, hopping_barcodes, threshold)
+
+@phantombuster.command()
+@click.option("--outdir", required=True)
+@click.option("--prefix")
+@click.option("--threshold-file", required=True)
+def threshold(outdir, prefix, threshold_file):
+    log_call("threshold", outdir=outdir, prefix=prefix, threshold_file=threshold_file)
+    core.threshold(outdir, prefix, threshold_file)
+
+# -- Helper Commands -- #
+
+@phantombuster.command()
+@click.argument("prefixes", nargs=-1)
+@click.option("--outdir", required=True)
+@click.option("--prefix")
+@click.option("--barcode-hierarchy-file", type=click.Path(exists=True), required=True)
+def merge(prefixes, outdir, prefix, barcode_hierarchy_file):
+    """
+    Merge multiple prefixes under one prefix
+    """
+    # Log the call of this function with all parameters to the logfile
+    log_call("merge", prefixes=prefixes, outdir=outdir, prefix=prefix, barcode_hierarchy_file=barcode_hierarchy_file)
+
+    master_paths = PathsAndFiles(outdir, prefix, None)
+    master_paths.create()
+
+    try:
+        barcode_hierarchy = plumbing.read_barcode_hierarchy_file(barcode_hierarchy_file)
+    except Exception:
+        raise Exception("Could not read barcode hierarchy file correctly")
+
+    to_merge = [PathsAndFiles(outdir, prefix, None) for prefix in prefixes]
+
+    results = [stores.load(('deduplication', True), paths.stage_path('deduplication')) for paths in to_merge]
+    out = plumbing.combine(results, barcode_hierarchy)
+
+    stores.save(out, master_paths.stage_path('deduplication'), id='deduplication')
+
+
+
+@phantombuster.command()
+@click.argument("parquetfile")
+@click.argument("outfile", default=None, required=False)
+def to_csv(parquetfile, outfile):
+    log_call("to_csv", sample=parquetfile, outdir=outfile)
+    table = pyarrow.parquet.read_table(parquetfile)
+    if outfile is None:
+        outfile = parquetfile.replace(".parquet", ".csv")
+    pyarrow.csv.write_csv(table, outfile)
+
+
+@phantombuster.command()
+@click.argument("csvfile")
+@click.argument("outfile", default=None, required=False)
+def to_parquet(csvfile, outfile):
+    log_call("to_parquet", csvfile=csvfile, outfile=outfile)
+    table = pyarrow.csv.read_csv(csvfile)
+    if outfile is None:
+        outfile = csvfile.replace(".csv", ".parquet")
+    write_parquet(table, outfile)
+
+
+@phantombuster.command()
+@click.option("--outdir", default=None, required=True)
+@click.option("--name", default=None)
+def worker(outdir, name):
+    import phantombuster as phantombuster
+    import phantombuster.plumbing
+
+    project = Project(outdir)
+    project.create()
+    path = project._get_server_path()
+
+    print(f"Connecting to {path}")
+    worker = Worker(path, name=name)
+    worker.start_async()
+
+if __name__ == "__main__":
+    phantombuster()
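
The `phantombuster` console script declared in pyproject.toml points at this click group, so the subcommands above can also be driven programmatically, for example in tests. A sketch using click's test runner; all file paths are placeholders, and `demultiplex` expects them to exist:

```python
# Sketch: invoking the demultiplex subcommand through click's CliRunner.
# The paths below are placeholders and must point to real files.
from click.testing import CliRunner
from phantombuster.cli import phantombuster

runner = CliRunner()
result = runner.invoke(phantombuster, [
    "demultiplex", "reads.bam",
    "--outdir", "results",
    "--regex-file", "regexes.csv",
    "--barcode-hierarchy-file", "barcodes.csv",
])
print(result.exit_code)
print(result.output)
```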

phantombuster-0.12.0/src/phantombuster/config_files.py
@@ -0,0 +1,179 @@
+from dataclasses import dataclass
+import logging
+import pyarrow
+import pyarrow.csv
+import regex
+from phantombuster.plumbing import calculate_threshold
+
+VALID_TAGS = ['b2', 'query', 'bc', 'name', 'seq']
+
+
+def read_file(name: str) -> pyarrow.Table:
+    if name.endswith('tsv'):
+        options = pyarrow.csv.ParseOptions(delimiter="\t")
+    else:
+        options = pyarrow.csv.ParseOptions()
+    table = pyarrow.csv.read_csv(name, parse_options=options)
+    return table
+
+
+@dataclass
+class InputFile:
+    path: str
+    group: str
+    prefix: str
+
+    def _to_json_e(self):
+        return {'path': self.path, 'group': self.group, 'prefix': self.prefix}
+
+    @classmethod
+    def _from_json_e(cls, d: dict):
+        return InputFile(**d)
+
+
+@dataclass
+class InputGroup:
+    files: list[InputFile]
+    group: str
+
+    def _to_json_e(self) -> dict:
+        return {'files': [f._to_json_e() for f in self.files], 'group': self.group}
+
+    @classmethod
+    def _from_json_e(cls, d: dict) -> 'InputGroup':
+        ig = InputGroup(files=[InputFile._from_json_e(f) for f in d['files']], group=d['group'])
+        return ig
+
+
+class RegexDictionary:
+
+    def __init__(self, d=None):
+        if d is None:
+            d = {}
+        self._groups = d
+
+    def __eq__(self, o):
+        if isinstance(o, RegexDictionary):
+            return self._groups == o._groups
+        else:
+            return NotImplemented()
+
+    def add_regex(self, tag, rex, prefix='', group='*'):
+        d = self._groups.get(group, {})
+        d[prefix+tag] = rex
+        self._groups[group] = d
+
+    def get_regexes_for_group(self, group):
+        d1 = self._groups.get(group, {})
+        d2 = self._groups.get('*', {})
+        d = d1|d2
+        assert len(d) > 0
+        return {key: regex.compile(re) for key, re in d.items()}
+
+    def _to_json_e(self):
+        return self._groups
+
+    @classmethod
+    def _from_json_e(cls, d):
+        r = RegexDictionary(d)
+        return r
+
+
+def read_regex_file(path):
+    table = read_file(path)
+    column_names = ['group', 'prefix', 'tag', 'regex']
+
+    tags = table['tag'].to_pylist()
+
+    invalid_tags = [tag for tag in tags if tag not in VALID_TAGS]
+    if len(invalid_tags) > 0:
+        raise KeyError(f'Invalid tags: {", ".join(invalid_tags)}')
+
+    regexes = table['regex'].to_pylist()
+
+    if 'group' not in table.column_names:
+        groups = ['*'] * len(tags)
+    else:
+        groups = table['group'].to_pylist()
+
+    if 'prefix' not in table.column_names:
+        prefixs = [''] * len(tags)
+    else:
+        prefixs = table['prefix'].to_pylist()
+
+    regex_dict = RegexDictionary()
+
+    for group, tag, re, prefix in zip(groups, tags, regexes, prefixs):
+        regex_dict.add_regex(tag, re, prefix, group)
+    return regex_dict
+
+
+def read_barcode_hierarchy_file(name):
+    table = read_file(name)
+    column_names = ["barcode", "type", "referencefile", "threshold", "min_length", "max_length"]
+    missing_columns = [col for col in column_names if col not in table.column_names]
+    if len(missing_columns) > 0 :
+        raise Exception(f"Barcode Hierarchy File Incorrect, missing columns: {missing_columns}") # TODO better error reporting
+
+    def parse_length(value):
+        if value == "-":
+            return None
+        else:
+            try:
+                return int(value)
+            except Exception:
+                raise Exception("Can not parse min or max length correctly in barcode hierarchy file, must be either '-' or an integer")
+
+    def parse_type(value):
+        if value == "reference":
+            return "reference"
+        else:
+            return "random"
+
+    barcodes = [{"name": name, "type": parse_type(type), "referencefile": reference, "threshold": threshold,
+                 "min_length": parse_length(min_length), "max_length": parse_length(max_length)}
+                for name, type, reference, threshold, min_length, max_length in zip(*[table[name].to_pylist() for name in column_names])]
+
+    for bc in barcodes:
+        if bc["type"] == "reference":
+            table = read_file(bc["referencefile"])
+            bc["reference"] = {bc: name for bc, name in zip([str(bc) for bc in table["barcode"].to_pylist()], [str(name) for name in table["name"].to_pylist()])}
+        if bc["threshold"] == "auto":
+            bc["threshold"] = calculate_threshold(bc["reference"])
+        else:
+            try:
+                bc["threshold"] = int(bc["threshold"])
+            except Exception:
+                raise Exception("Can not properly read threshold column of barcode hierarchy file, needs to be 'auto' or an integer")
+    return barcodes
+
+
+def read_input_files_file(path):
+    table = read_file(path)
+
+    column_names = ['file', 'group', 'prefix']
+
+    files = table['file'].to_pylist()
+
+    if 'group' not in table.column_names:
+        groups = table["file"].to_pylist()
+    else:
+        groups = table['group'].to_pylist()
+
+    if 'prefix' not in table.column_names:
+        prefixs = [''] * len(files)
+    else:
+        prefixs = table['prefix'].to_pylist()
+
+    input_files = [InputFile(file, group if group is not None else "", prefix if prefix is not None else "") for file, group, prefix in zip(files, groups, prefixs)]
+    groups = {}
+    for f in input_files:
+        l = groups.get(f.group, [])
+        l.append(f)
+        groups[f.group] = l
+
+    igs = []
+    for group_name, files in groups.items():
+        igs.append(InputGroup(files, group_name))
+
+    return igs