PyPI - TSSV - Versions diffs - 1.2.1__tar.gz - Mend

TSSV 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

tssv-1.2.1/LICENSE.md +21 -0
tssv-1.2.1/PKG-INFO +55 -0
tssv-1.2.1/README.rst +36 -0
tssv-1.2.1/TSSV.egg-info/PKG-INFO +55 -0
tssv-1.2.1/TSSV.egg-info/SOURCES.txt +20 -0
tssv-1.2.1/TSSV.egg-info/dependency_links.txt +1 -0
tssv-1.2.1/TSSV.egg-info/entry_points.txt +3 -0
tssv-1.2.1/TSSV.egg-info/requires.txt +3 -0
tssv-1.2.1/TSSV.egg-info/top_level.txt +2 -0
tssv-1.2.1/extras/annotate/annotate.py +95 -0
tssv-1.2.1/pyproject.toml +32 -0
tssv-1.2.1/setup.cfg +4 -0
tssv-1.2.1/tests/test_align.py +41 -0
tssv-1.2.1/tests/test_annotate.py +18 -0
tssv-1.2.1/tests/test_tssv.py +25 -0
tssv-1.2.1/tssv/__init__.py +38 -0
tssv-1.2.1/tssv/align_pair.py +22 -0
tssv-1.2.1/tssv/cli.py +64 -0
tssv-1.2.1/tssv/sgAlign.c +104 -0
tssv-1.2.1/tssv/sgAlign.h +18 -0
tssv-1.2.1/tssv/sgAlignWrapper.c +62 -0
tssv-1.2.1/tssv/tssv.py +439 -0

tssv-1.2.1/LICENSE.md ADDED Viewed

@@ -0,0 +1,21 @@
+Copyright (c) 2012-2018 Jeroen F.J. Laros <j.f.j.laros@lumc.nl>
+Copyright (c) 2016 Jerry Hoogenboom <j.hoogenboom@nfi.minvenj.nl>
+Copyright (c) 2012 Jaap W.F. van der Heijden
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

tssv-1.2.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,55 @@
+Metadata-Version: 2.4
+Name: TSSV
+Version: 1.2.1
+Summary: Targeted characterisation of short structural variation.
+Author-email: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl>
+License-Expression: MIT
+Project-URL: homepage, https://git.lumc.nl/j.f.j.laros/tssv
+Keywords: bioinformatics
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering
+Description-Content-Type: text/x-rst
+License-File: LICENSE.md
+Requires-Dist: biopython
+Requires-Dist: requests
+Requires-Dist: xopen
+Dynamic: license-file
+TSSV: Targeted characterisation of short structural variation
+=============================================================
+.. image:: https://img.shields.io/github/last-commit/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/graphs/commit-activity
+.. image:: https://github.com/jfjlaros/tssv/actions/workflows/test.yml/badge.svg
+   :target: https://github.com/jfjlaros/tssv/actions/workflows/test.yml
+.. image:: https://readthedocs.org/projects/tssv/badge/?version=latest
+   :target: https://tssv.readthedocs.io/en/latest
+.. image:: https://img.shields.io/github/release-date/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/github/release/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/pypi/v/tssv.svg
+   :target: https://pypi.org/project/tssv/
+.. image:: https://img.shields.io/github/languages/code-size/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/count/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/top/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/license/jfjlaros/tssv.svg
+   :target: https://raw.githubusercontent.com/jfjlaros/tssv/master/LICENSE.md
+----
+TSSV is a program that does targeted characterisation of short structural
+variation. It can be used for STR analysis, or any other type of targeted
+analysis. It characterises any variation between a set of user-defined markers.
+TSSV is platform-independent. It has been tested on Linux, macOS, and Windows.
+Please see ReadTheDocs_ for the latest documentation.
+.. _ReadTheDocs: https://tssv.readthedocs.io/en/latest/index.html

tssv-1.2.1/README.rst ADDED Viewed

@@ -0,0 +1,36 @@
+TSSV: Targeted characterisation of short structural variation
+=============================================================
+.. image:: https://img.shields.io/github/last-commit/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/graphs/commit-activity
+.. image:: https://github.com/jfjlaros/tssv/actions/workflows/test.yml/badge.svg
+   :target: https://github.com/jfjlaros/tssv/actions/workflows/test.yml
+.. image:: https://readthedocs.org/projects/tssv/badge/?version=latest
+   :target: https://tssv.readthedocs.io/en/latest
+.. image:: https://img.shields.io/github/release-date/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/github/release/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/pypi/v/tssv.svg
+   :target: https://pypi.org/project/tssv/
+.. image:: https://img.shields.io/github/languages/code-size/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/count/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/top/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/license/jfjlaros/tssv.svg
+   :target: https://raw.githubusercontent.com/jfjlaros/tssv/master/LICENSE.md
+----
+TSSV is a program that does targeted characterisation of short structural
+variation. It can be used for STR analysis, or any other type of targeted
+analysis. It characterises any variation between a set of user-defined markers.
+TSSV is platform-independent. It has been tested on Linux, macOS, and Windows.
+Please see ReadTheDocs_ for the latest documentation.
+.. _ReadTheDocs: https://tssv.readthedocs.io/en/latest/index.html

tssv-1.2.1/TSSV.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,55 @@
+Metadata-Version: 2.4
+Name: TSSV
+Version: 1.2.1
+Summary: Targeted characterisation of short structural variation.
+Author-email: "Jeroen F.J. Laros" <jlaros@fixedpoint.nl>
+License-Expression: MIT
+Project-URL: homepage, https://git.lumc.nl/j.f.j.laros/tssv
+Keywords: bioinformatics
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering
+Description-Content-Type: text/x-rst
+License-File: LICENSE.md
+Requires-Dist: biopython
+Requires-Dist: requests
+Requires-Dist: xopen
+Dynamic: license-file
+TSSV: Targeted characterisation of short structural variation
+=============================================================
+.. image:: https://img.shields.io/github/last-commit/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/graphs/commit-activity
+.. image:: https://github.com/jfjlaros/tssv/actions/workflows/test.yml/badge.svg
+   :target: https://github.com/jfjlaros/tssv/actions/workflows/test.yml
+.. image:: https://readthedocs.org/projects/tssv/badge/?version=latest
+   :target: https://tssv.readthedocs.io/en/latest
+.. image:: https://img.shields.io/github/release-date/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/github/release/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv/releases
+.. image:: https://img.shields.io/pypi/v/tssv.svg
+   :target: https://pypi.org/project/tssv/
+.. image:: https://img.shields.io/github/languages/code-size/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/count/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/languages/top/jfjlaros/tssv.svg
+   :target: https://github.com/jfjlaros/tssv
+.. image:: https://img.shields.io/github/license/jfjlaros/tssv.svg
+   :target: https://raw.githubusercontent.com/jfjlaros/tssv/master/LICENSE.md
+----
+TSSV is a program that does targeted characterisation of short structural
+variation. It can be used for STR analysis, or any other type of targeted
+analysis. It characterises any variation between a set of user-defined markers.
+TSSV is platform-independent. It has been tested on Linux, macOS, and Windows.
+Please see ReadTheDocs_ for the latest documentation.
+.. _ReadTheDocs: https://tssv.readthedocs.io/en/latest/index.html

tssv-1.2.1/TSSV.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,20 @@
+LICENSE.md
+README.rst
+pyproject.toml
+TSSV.egg-info/PKG-INFO
+TSSV.egg-info/SOURCES.txt
+TSSV.egg-info/dependency_links.txt
+TSSV.egg-info/entry_points.txt
+TSSV.egg-info/requires.txt
+TSSV.egg-info/top_level.txt
+extras/annotate/annotate.py
+tests/test_align.py
+tests/test_annotate.py
+tests/test_tssv.py
+tssv/__init__.py
+tssv/align_pair.py
+tssv/cli.py
+tssv/sgAlign.c
+tssv/sgAlign.h
+tssv/sgAlignWrapper.c
+tssv/tssv.py

tssv-1.2.1/TSSV.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

tssv-1.2.1/TSSV.egg-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+tannotate = tssv_extras.annotate:main
+tssv = tssv.cli:main

tssv-1.2.1/TSSV.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,3 @@
+biopython
+requests
+xopen

tssv-1.2.1/TSSV.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1,2 @@
+tssv
+tssv_extras

tssv-1.2.1/extras/annotate/annotate.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Convert a csv files containing alleles and counts to HGVS descriptions of
+alleles and single variants. Also report statistics about the variant types.
+The input file is typically one of the output files of tssv.
+"""
+from argparse import ArgumentParser, FileType, RawDescriptionHelpFormatter
+from collections import defaultdict
+from sys import stdout
+from requests import get as req_get
+def write_table(data, title, report_handle, minimum):
+    """Write a table to a file.
+    A header line is written first, followed by one tab separated line per
+    key. Keys are visited in descending order of their first character and
+    keys whose total count is below `minimum` are left out.
+    :arg dict data: Dictionary containing counts per type; every value holds
+        the three counts (total, forward, reverse).
+    :arg str title: Name of the first column.
+    :arg stream report_handle: Open writeable handle to the report file.
+    :arg int minimum: Minimum count.
+    """
+    report_handle.write('{}\ttotal\tforward\treverse\n'.format(title))
+    for i in sorted(data, key=lambda x: x[0], reverse=True):
+        # The rows are not ordered by count, so a row below the minimum is
+        # skipped on its own. Stopping at the first such row would silently
+        # drop every later row that does meet the minimum.
+        if data[i][0] < minimum:
+            continue
+        report_handle.write('{}\t{}\t{}\t{}\n'.format(i, *data[i]))
+def annotate(alleles_handle, reference, report_handle, minimum):
+    """Convert a csv file containing alleles and counts to HGVS descriptions
+    of alleles, single variants and variant types.
+    Every allele is described relative to the reference by the Mutalyzer
+    description extractor web service. The counts of the allele are added to
+    the totals of its allele description, of each of its single variants and
+    of the type of each of these variants. The three resulting tables are
+    written to the report, separated by an empty line.
+    :arg stream alleles_handle: Open handle to the alleles file. The first
+        line is skipped; every other line holds an allele followed by its
+        integer counts, separated by tabs.
+    :arg str reference: The reference sequence.
+    :arg stream report_handle: Open writeable handle to the report file.
+    :arg int minimum: Minimum count.
+    :raises requests.RequestException: If a request fails, times out or
+        returns an HTTP error status.
+    """
+    def add_counts(table, key, counts):
+        """Add `counts` element-wise to the totals stored under `key`."""
+        table[key] = list(map(sum, zip(table[key], counts)))
+    alleles = defaultdict(lambda: [0, 0, 0])
+    raw_vars = defaultdict(lambda: [0, 0, 0])
+    classification = defaultdict(lambda: [0, 0, 0])
+    for line in alleles_handle.readlines()[1:]:
+        # A blank line (e.g., at the end of the file) does not hold an
+        # allele and would otherwise corrupt the count tables.
+        if not line.strip():
+            continue
+        i = line.strip('\n').split('\t')
+        # Let the HTTP library build the query string, so both sequences are
+        # escaped properly. The timeout prevents waiting forever on an
+        # unresponsive server.
+        response = req_get(
+            'https://v2.mutalyzer.nl/json/descriptionExtract',
+            params={'reference': reference, 'observed': i[0]},
+            timeout=60)
+        # Fail on an HTTP error instead of trying to parse an error page.
+        response.raise_for_status()
+        allele_description = response.json()
+        encountered = list(map(int, i[1:]))
+        add_counts(alleles, allele_description['description'], encountered)
+        for j in allele_description['allele']:
+            add_counts(raw_vars, j['description'], encountered)
+            add_counts(classification, j['type'], encountered)
+    write_table(alleles, 'allele', report_handle, minimum)
+    report_handle.write('\n')
+    write_table(raw_vars, 'variant', report_handle, minimum)
+    report_handle.write('\n')
+    write_table(classification, 'class', report_handle, minimum)
+def main():
+    """Main entry point."""
+    # The module docstring provides the description and, when a second part
+    # follows after two empty lines, the epilog of the help text.
+    usage = __doc__.split('\n\n\n')
+    parser = ArgumentParser(
+        description=usage[0], epilog=usage[1] if len(usage) > 1 else None,
+        formatter_class=RawDescriptionHelpFormatter)
+    parser.add_argument(
+        'alleles', metavar='alleles', type=FileType('r'),
+        help='the alleles file')
+    parser.add_argument(
+        'reference', metavar='reference', type=str,
+        help='sequence of the reference allele')
+    parser.add_argument(
+        '-r', dest='report', type=FileType('w'), default=stdout,
+        help='name of the report file')
+    parser.add_argument(
+        '-a', dest='minimum', type=int, default=0,
+        help='minimum count (default=%(default)s)')
+    args = parser.parse_args()
+    try:
+        annotate(args.alleles, args.reference, args.report, args.minimum)
+    except OSError as error:
+        # Report I/O problems as a usage error instead of a traceback.
+        parser.error(error)
+if __name__ == '__main__':
+    main()

tssv-1.2.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ['setuptools']
+build-backend = 'setuptools.build_meta'
+[tool.setuptools]
+package-dir = {tssv = 'tssv', tssv_extras = 'extras/annotate'}
+ext-modules = [{name = 'tssv.sg_align', sources = ['tssv/sgAlignWrapper.c', 'tssv/sgAlign.c'], extra-compile-args = ['-O3']}]
+[tool.setuptools.package-data]
+tssv = ['sgAlign.h']
+[project]
+name = 'TSSV'
+description = 'Targeted characterisation of short structural variation.'
+version = '1.2.1'
+authors = [{name = 'Jeroen F.J. Laros', email = 'jlaros@fixedpoint.nl'}]
+urls = {homepage = 'https://git.lumc.nl/j.f.j.laros/tssv'}
+readme = 'README.rst'
+keywords = ['bioinformatics']
+classifiers = [
+    'Programming Language :: Python :: 3',
+    'Operating System :: OS Independent',
+    'Intended Audience :: Science/Research',
+    'Topic :: Scientific/Engineering']
+license = 'MIT'
+dependencies = ['biopython', 'requests', 'xopen']
+[project.scripts]
+tssv = 'tssv.cli:main'
+tannotate = 'tssv_extras.annotate:main'

tssv-1.2.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

tssv-1.2.1/tests/test_align.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Tests for the alignment modules."""
+from Bio import Seq
+from tssv import align_pair
+class TestAlign:
+    """Check `align_pair` on a fixed reference with several marker pairs."""
+    def setup_method(self):
+        """Store the reference sequence and its reverse complement."""
+        forward = 'GACTGTCGTGGGCTCTTACGCACATATTATAACTTTCATAAGTTTGTCAG'
+        self._reference = forward
+        self._reference_rc = Seq.reverse_complement(forward)
+    def _align(self, pair, *extra):
+        """Align `pair` to the stored reference, passing on `extra`."""
+        return align_pair(self._reference, self._reference_rc, pair, *extra)
+    def test_align_pair_perfect(self):
+        """Align the unmodified marker pair."""
+        expected = ((3, 34), (0, 11))
+        assert self._align(('TTATGAAAGT', 'CGTAAGAGC')) == expected
+    def test_align_pair_subst(self):
+        """Align markers that each have one substituted nucleotide."""
+        expected = ((3, 34), (1, 11))
+        assert self._align(('TTATGTAAGT', 'CGTATGAGC')) == expected
+    def test_align_pair_del(self):
+        """Align markers that each have one deleted nucleotide."""
+        expected = ((2, 34), (1, 11))
+        assert self._align(('TTATGAAGT', 'CGTAGAGC')) == expected
+    def test_align_pair_ins(self):
+        """Align markers that each have one inserted nucleotide."""
+        expected = ((3, 43), (1, 11))
+        assert self._align(('TTATGACAAGT', 'CGTAACGAGC')) == expected
+    def test_align_pair_del_punish_indel(self):
+        """Align the deletion markers with an indel score of 3."""
+        expected = ((3, 35), (3, 11))
+        assert self._align(('TTATGAAGT', 'CGTAGAGC'), 3) == expected
+    def test_align_pair_ins_punish_indel(self):
+        """Align the insertion markers with an indel score of 3."""
+        expected = ((4, 34), (3, 11))
+        assert self._align(('TTATGACAAGT', 'CGTAACGAGC'), 3) == expected

tssv-1.2.1/tests/test_annotate.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""Tests for the annotation CLI."""
+from io import StringIO
+from fake_open import md5_check
+from tssv_extras import annotate
+class TestAnnotation(object):
+    """Test the annotation CLI."""
+    def setup_method(self):
+        """Open the alleles input file and create an in-memory report."""
+        self._input = open('data/m1_newalleles.csv')
+        self._output = StringIO()
+    def teardown_method(self):
+        """Close the input file that was opened in `setup_method`."""
+        self._input.close()
+    def test_annotate(self):
+        """Annotate the input file and compare the report to a checksum."""
+        annotate.annotate(self._input, 'TCCGTCCCATGCATGC', self._output, 0)
+        assert md5_check(
+            self._output.getvalue(), '596241e84c6d20a4155034236c234c51')

tssv-1.2.1/tests/test_tssv.py ADDED Viewed

@@ -0,0 +1,25 @@
+"""Tests for the parsing of library files."""
+from io import StringIO
+from tssv.tssv import parse_library
+class TestTSSV(object):
+    """Test the parsing of library files."""
+    def setup_method(self):
+        """Create an in-memory output handle."""
+        self._output = StringIO()
+    def test_parse_library(self):
+        """Parse a library with patterns, using a relative threshold."""
+        # Use a context manager, so the library file is closed again.
+        with open('data/library.csv') as handle:
+            library = parse_library(handle, 0.1)
+        assert len(library) == 4
+        assert library['m3']['flanks'] == ['TTATTATCTCTC', 'CTATCGAGAGAGAT']
+        assert library['m3']['reg_exp'].pattern == '^(TTTAT){1,1}(GGGA){0,1}$'
+        assert library['m3']['thresholds'] == [2, 2]
+    def test_parse_library_without_pattern(self):
+        """Parse a library without patterns."""
+        with open('data/library_lite.csv') as handle:
+            library = parse_library(handle, 0)
+        assert library['m3']['reg_exp'].pattern == '(?!x)x'
+    def test_parse_library_with_mismatches(self):
+        """Parse a library with a fixed number of mismatches."""
+        with open('data/library.csv') as handle:
+            library = parse_library(handle, 0.1, 1)
+        assert library['m3']['thresholds'] == [1, 1]

tssv-1.2.1/tssv/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+from argparse import FileType
+from importlib.metadata import PackageNotFoundError, metadata
+from os.path import exists
+from re import split
+from typing import Callable
+from .align_pair import align_pair
+from .sg_align import align
+class ProtectedFileType(FileType):
+    """File type for argparse that refuses to overwrite an existing file."""
+    def __call__(self, string):
+        """Open the file named `string`.
+        :arg str string: Name of the file to open.
+        :returns stream: The handle opened by `FileType`.
+        :raises IOError: If the mode contains 'w' and the file exists.
+        """
+        opened_for_writing = 'w' in self._mode
+        if opened_for_writing and exists(string):
+            raise IOError(f'failed to create "{string}": file exists.')
+        return super().__call__(string)
+def _extract(key: str, delim: str = r'[^\s\S]', index: int = 0) -> str:
+    """Extract (part of) a field from the metadata of this package.
+    :arg str key: Name of the metadata field.
+    :arg str delim: Regular expression to split the field on. The default
+        matches nothing, so the field is returned as a whole.
+    :arg int index: Index of the part to return after splitting.
+    :returns str: The selected part, an empty string when the field is
+        missing and `index` is 0, or '<NO DATA>' when the package metadata
+        or the requested part is not available.
+    """
+    try:
+        value = metadata(__package__).get(key, '')
+    except PackageNotFoundError:
+        return '<NO DATA>'
+    try:
+        return split(delim, value)[index]
+    except IndexError:
+        # The field is missing or does not have the expected format. This
+        # function runs at import time, so fall back to the placeholder
+        # instead of making the whole package unimportable.
+        return '<NO DATA>'
+def doc_split(func: Callable) -> str:
+    """Return the first paragraph of the docstring of a function.
+    :arg Callable func: The function to take the docstring from.
+    :returns str: The docstring up to its first empty line, or an empty
+        string when the function has no docstring.
+    """
+    # `__doc__` is None for functions without a docstring (and for all
+    # functions when Python runs with -OO).
+    return (func.__doc__ or '').split('\n\n')[0]
+# Package information, taken from the metadata of the installed package.
+_project = _extract('Name')
+_version = _extract('Version')
+_year = '2010-2026'
+# `Author-email` has the form '"name" <address>': the name is the text
+# between the double quotes, the address the text between the angle brackets.
+_author = _extract('Author-email', r'"', 1)
+_email = _extract('Author-email', r'<|>', 1)
+_description = _extract('Summary')
+_copyright = f'Copyright (c) {_year} by {_author} <{_email}>'
+# `Project-URL` has the form 'label, url': keep the part after the separator.
+_url = _extract('Project-URL', r', ', 1)
+# Text shown for the `-v` option of the command line interface.
+_info = f'{_project} version {_version}\n\n{_copyright}\nHomepage: {_url}'

tssv-1.2.1/tssv/align_pair.py ADDED Viewed

@@ -0,0 +1,22 @@
+from .sg_align import align
+def align_pair(reference, reference_rc, pair, indel_score=1):
+    """Align a pair of markers to a reference sequence.
+    The first marker is aligned to the reference itself, the second marker
+    to the reverse complement of the reference. The position found for the
+    second marker is subtracted from the length of the reference before it
+    is returned.
+    :arg str reference: Reference sequence to align to.
+    :arg str reference_rc: Reverse complement of the reference sequence.
+    :arg list pair: A pair (forward, reverse) of markers to align.
+    :arg int indel_score: Penalty score for insertions and deletions per
+        nucleotide.
+    :returns tuple: Two tuples (distance, position), one for each marker.
+    """
+    forward_marker, reverse_marker = pair[0], pair[1]
+    forward_hit = align(reference, forward_marker, indel_score)
+    reverse_hit = align(reference_rc, reverse_marker, indel_score)
+    reverse_position = len(reference) - reverse_hit['position']
+    return (
+        (forward_hit['distance'], forward_hit['position']),
+        (reverse_hit['distance'], reverse_position))

tssv-1.2.1/tssv/cli.py ADDED Viewed

@@ -0,0 +1,64 @@
+from argparse import ArgumentParser, FileType, RawDescriptionHelpFormatter
+from sys import stdout
+from xopen import xopen
+from . import _copyright, _description, _info
+from .tssv import tssv
+def main():
+    """Main entry point."""
+    parser = ArgumentParser(
+        description=_description, epilog=_copyright,
+        formatter_class=RawDescriptionHelpFormatter)
+    parser.add_argument(
+        'input_handle', metavar='INPUT',
+        help='a FASTA/FASTQ file')
+    parser.add_argument(
+        'library_handle', metavar='LIBRARY', type=FileType('r'),
+        help='library of flanking sequences')
+    parser.add_argument(
+        '-m', dest='threshold', type=float, default=0.08,
+        help='mismatches per nucleotide (default=%(default)s)')
+    parser.add_argument(
+        '-M', dest='mismatches', type=int,
+        help='fixed number of mismatches, overrides -m (default=%(default)s)')
+    parser.add_argument(
+        '-n', dest='indel_score', type=int, default=1,
+        help='insertions and deletions are penalised this number of times '
+             'more heavily than mismatches (default=%(default)s)')
+    parser.add_argument(
+        '-r', dest='report_handle', type=FileType('w'), default=stdout,
+        help='name of the report file')
+    parser.add_argument(
+        '-j', dest='json_report', action='store_true', default=False,
+        help='use json format for the output file')
+    parser.add_argument('-d', dest='path', type=str, help='output directory')
+    parser.add_argument(
+        '-a', dest='minimum', type=int, default=0,
+        help='minimum count per allele (default=%(default)s)')
+    parser.add_argument('-v', action='version', version=_info)
+    args = parser.parse_args()
+    input_name = args.input_handle
+    try:
+        # Have a little look in the input file to determine the file format.
+        # An empty file has no first line; it ends up in the FASTQ branch.
+        with xopen(input_name, 'r') as fin:
+            first_line = next(fin, '')
+        if first_line.startswith('>'):
+            args.file_format = 'fasta'
+        else:
+            args.file_format = 'fastq'
+        # Now that we know the file format, we can open the file again and
+        # have access to the full file content. The context manager makes
+        # sure the file is closed when the analysis is done.
+        with xopen(input_name) as input_handle:
+            args.input_handle = input_handle
+            tssv(**{k: v for k, v in vars(args).items()
+                 if k not in ('func', 'subcommand')})
+    except OSError as error:
+        # Report I/O problems (including an unreadable input file) as a
+        # usage error instead of a traceback.
+        parser.error(error)
+if __name__ == '__main__':
+    main()

tssv-1.2.1/tssv/sgAlign.c ADDED Viewed

@@ -0,0 +1,104 @@
+/*
+ * Library for semi-global alignment.
+ */
+#include <string.h>
+#include <stdlib.h>
+#include "sgAlign.h"
+/*! Select the smaller of two integers.
+ *
+ * \param [in] a First value.
+ * \param [in] b Second value.
+ *
+ * \return `a` if it is strictly smaller than `b`, `b` otherwise.
+ */
+static inline int min_(int const a, int const b) {
+  return a < b ? a : b;
+}
+/*! Initialise a matrix for semi-global alignment.
+ *
+ * The first column is set to zero for rows 1 and up. The first row is set to
+ * `i * indelScore` for column `i`, which also sets element (0, 0) to zero.
+ * All other elements are left untouched.
+ *
+ * \param [in,out] matrix The alignment matrix, a flat buffer of `rows` *
+ *   `columns` integers that is accessed as a two-dimensional array.
+ * \param [in] rows Number of rows in the matrix.
+ * \param [in] columns Number of columns in the matrix.
+ * \param [in] indelScore Penalty score for insertions and deletions.
+ */
+void initMatrix_(
+    int *const matrix, size_t const rows, size_t const columns,
+    int const indelScore) {
+  /* View the flat buffer as `rows` rows of `columns` integers. */
+  int (*const matrix_)[columns] = (int (*const)[columns])matrix;
+  for (size_t i = 1; i < rows; i++) {
+    matrix_[i][0] = 0;
+  }
+  for (size_t i = 0; i < columns; i++) {
+    matrix_[0][i] = i * indelScore;
+  }
+}
+/*! Fill the alignment matrix.
+ *
+ * Every element outside the first row and the first column is calculated
+ * from its upper, left and upper left neighbours. The first row and the
+ * first column are read but not written, they must be initialised by the
+ * caller.
+ *
+ * \param [in, out] matrix The alignment matrix, a flat buffer of `rows` *
+ *   `columns` integers that is accessed as a two-dimensional array.
+ * \param [in] rows Number of rows in the matrix.
+ * \param [in] columns Number of columns in the matrix.
+ * \param [in] seq1 The sequence to be aligned to, indexed by row.
+ * \param [in] seq2 The sequence to be aligned, indexed by column.
+ * \param [in] indelScore Penalty score for insertions and deletions.
+ */
+void align_(
+    int *const matrix, size_t const rows, size_t const columns,
+    char const *const seq1, char const *const seq2, int const indelScore) {
+  /* View the flat buffer as `rows` rows of `columns` integers. */
+  int (*const matrix_)[columns] = (int (*const)[columns])matrix;
+  for (size_t r = 1; r < rows; r++) {
+    for (size_t c = 1; c < columns; c++) {
+      /* Take the cheapest of: a step from the upper or left neighbour at
+       * the cost of an indel, or a diagonal step that costs 1 if the two
+       * characters differ and 0 if they are equal. */
+      matrix_[r][c] = min_(
+        min_(matrix_[r - 1][c], matrix_[r][c - 1]) + indelScore,
+        matrix_[r - 1][c - 1] + (seq1[r - 1] != seq2[c - 1]));
+    }
+  }
+}
+/*! Find the minimum distance, ignoring a trailing gap in the sequence
+ * associated with the number of rows in an alignment matrix. If the minimum
+ * distance is found, also return the row number.
+ *
+ * Only the last column is inspected, for rows 1 and up. The search starts
+ * with a distance of `columns` - 1 at position 0, which is returned when no
+ * row has a smaller value. Of several rows that share the smallest value,
+ * the first one is reported.
+ *
+ * \param [in] matrix A `rows` * `columns` matrix.
+ * \param [in] rows Number of rows in the matrix.
+ * \param [in] columns Number of columns in the matrix.
+ *
+ * \return The minimum distance and its row number.
+ */
+Alignment findMin_(
+    int const *const matrix, size_t const rows, size_t const columns) {
+  /* View the flat buffer as `rows` rows of `columns` integers. */
+  int const (*const matrix_)[columns] = (int const (*const)[columns])matrix;
+  Alignment a = {columns - 1, 0};
+  for (size_t r = 1; r < rows; r++) {
+    /* NOTE(review): this compares an int with the size_t `a.distance`, so
+     * the matrix value is converted to unsigned first; presumably the
+     * values are never negative - confirm. */
+    if (matrix_[r][columns - 1] < a.distance) {
+      a.distance = matrix_[r][columns - 1];
+      a.position = r;
+    }
+  }
+  return a;
+}
+/*! Do a semi-global alignment of `seq2` to `seq1`.
+ *
+ * A matrix of (length of `seq1` + 1) rows and (length of `seq2` + 1) columns
+ * is allocated, initialised, filled and searched for the minimum distance.
+ * The matrix is released before returning.
+ *
+ * \param [in] seq1 The sequence to be aligned to.
+ * \param [in] seq2 The sequence to be aligned.
+ * \param [in] indelScore Penalty score for insertions and deletions.
+ *
+ * \return The minimum distance and its row number.
+ */
+Alignment align(
+    char const *const seq1, char const *const seq2, int const indelScore) {
+  Alignment a;
+  size_t rows = strlen(seq1) + 1;
+  size_t columns = strlen(seq2) + 1;
+  /* NOTE(review): the result of malloc is not checked; a failed allocation
+   * leads to a NULL pointer dereference in initMatrix_. Reporting this
+   * properly needs an error channel that `Alignment` does not have. */
+  int *matrix = (int *)malloc(rows * columns * sizeof(int));
+  initMatrix_(matrix, rows, columns, indelScore);
+  align_(matrix, rows, columns, seq1, seq2, indelScore);
+  a = findMin_(matrix, rows, columns);
+  free(matrix);
+  return a;
+}

tssv-1.2.1/tssv/sgAlign.h ADDED Viewed

@@ -0,0 +1,18 @@
+#pragma once
+/* Provides size_t, so this header does not depend on what the including
+ * file happened to include before it. */
+#include <stddef.h>
+/*! Result of a semi-global alignment. */
+typedef struct {
+  size_t distance;  /*!< The minimum distance. */
+  size_t position;  /*!< Row number at which the minimum distance was found. */
+} Alignment;
+/*! Do a semi-global alignment of `seq2` to `seq1`.
+ *
+ * \param [in] seq1 The sequence to be aligned to.
+ * \param [in] seq2 The sequence to be aligned.
+ * \param [in] indelScore Penalty score for insertions and deletions.
+ *
+ * \return The minimum distance and its row number.
+ */
+Alignment align(
+  char const *const seq1, char const *const seq2, int const indelScore);

tssv-1.2.1/tssv/sgAlignWrapper.c ADDED Viewed

@@ -0,0 +1,62 @@
+#include <Python.h>
+#include "sgAlign.h"
+/**
+ * Build the Python representation of an alignment: a dictionary with the
+ * integer entries "distance" and "position".
+ */
+PyObject *pyAlignment(int distance, int position) {
+  PyObject *const alignment = Py_BuildValue(
+    "{s: i, s: i}", "distance", distance, "position", position);
+  return alignment;
+}
+/**
+ * Python entry point for the align function.
+ *
+ * Expects two strings and an integer as arguments. Returns NULL when the
+ * arguments can not be parsed, the converted alignment otherwise.
+ */
+PyObject *pyAlign(PyObject *self, PyObject *args) {
+  char *seq1 = NULL;
+  char *seq2 = NULL;
+  int indel_score = 0;
+  if (PyArg_ParseTuple(args, "ssi", &seq1, &seq2, &indel_score)) {
+    Alignment const a = align(seq1, seq2, indel_score);
+    return pyAlignment(a.distance, a.position);
+  }
+  return NULL;
+}
+/*
+ * Module methods: the table that exposes `pyAlign` to Python under the name
+ * "align", together with its docstring. The table ends with an all-NULL
+ * entry.
+ */
+PyMethodDef pySgAlignMethods[] = {
+  {
+    "align", pyAlign, METH_VARARGS,
+    "Do a semi-global alignment of {seq2} to {seq1}.\n\n"
+    "  :arg str seq1: The sequence to be aligned to.\n"
+    "  :arg str seq2: The sequence to be aligned.\n"
+    "  :arg int indel_score: Penalty score for insertions and deletions.\n\n"
+    "  :returns dict alignment: The minimum distance and its row number.\n"},
+  {NULL, NULL, 0, NULL}
+};
+/*
+ * Module definition: the module name, its docstring, the size of the per
+ * module state (-1) and the method table.
+ */
+struct PyModuleDef sgAlignModule = {
+  PyModuleDef_HEAD_INIT,
+  "sg_align",
+  "Library for semi-global alignment.",
+  -1,
+  pySgAlignMethods
+};
+/**
+ * Module init function: creates the module object from the module
+ * definition `sgAlignModule` and returns it.
+ */
+PyMODINIT_FUNC PyInit_sg_align(void) {
+  return PyModule_Create(&sgAlignModule);
+}

tssv-1.2.1/tssv/tssv.py ADDED Viewed

@@ -0,0 +1,439 @@
+from collections import defaultdict
+from functools import reduce
+from json import dump
+from math import ceil
+from os import mkdir
+from re import compile as re_compile
+from Bio import Seq, SeqIO
+from .align_pair import align_pair
+# Keys are used to look up handles in the dictionary made by open_files.
+file_names = {
+    'unknown': 'unknown.seq',
+    'markers': 'markers.csv',
+    'known': 'knownalleles.csv',
+    'new': 'newalleles.csv',
+    'nostart': 'nostart.csv',
+    'noend': 'noend.csv',
+    'summary': 'summary.csv',
+}
+"""Names of the global report files."""
+# One set of these files is created inside every marker subfolder.
+marker_file_names = {
+    'known': 'known.seq',
+    'new': 'new.seq',
+    'noend': 'noend.seq',
+    'nostart': 'nostart.seq',
+    'knownalleles': 'knownalleles.csv',
+    'newalleles': 'newalleles.csv',
+}
+"""Names of the marker specific report files."""
+# Tab separated column titles, each terminated by a newline.
+headers = {
+    'markers': '\t'.join((
+        'name', 'fPaired', 'rPaired', 'fLeft', 'rLeft', 'fRight',
+        'rRight')) + '\n',
+    'allele': '\t'.join(('allele', 'total', 'forward', 'reverse')) + '\n',
+    'nostartend': '\t'.join(('name', 'forward', 'reverse', 'total')) + '\n',
+    'overview': '\t'.join((
+        'name', 'forward', 'reverse', 'total', 'allele')) + '\n',
+}
+"""Headers for various tables."""
+def parse_library(library_handle, threshold, mismatches=0):
+    """Parse the library file and put the data in a nested dictionary
+    containing per marker the two forward flanking sequences, the two reverse
+    flanking sequences and a regular expression pattern object.
+    Every non-blank line holds tab separated fields: marker name, left
+    flank, right flank and, optionally, a whitespace separated list of
+    (unit, minimum, maximum) triples describing the known alleles. Blank
+    lines are skipped.
+    :arg stream library_handle: Open readable handle to a library file.
+    :arg float threshold: Number of allowed mismatches per nucleotide.
+    :arg int mismatches: If set, overrides the dynamic threshold calculation.
+    :returns dict: Nested dictionary containing library data.
+    """
+    library = {}
+    for line in library_handle:
+        fields = line.strip().split('\t')
+        if fields == ['']:
+            # Blank line (for instance a trailing newline): nothing to parse.
+            continue
+        pattern = '(?!x)x'  # This will never match anything.
+        if len(fields) == 4:
+            units = fields[3].split()
+            # Each triple becomes one repeated group: (unit){minimum,maximum}.
+            pattern = '^{}$'.format(''.join(
+                '({}){{{},{}}}'.format(
+                    units[k], units[k + 1], units[k + 2])
+                for k in range(0, len(units), 3)))
+        library[fields[0]] = {
+            # The right flank is stored as its reverse complement.
+            'flanks': [fields[1], Seq.reverse_complement(fields[2])],
+            'counts': [0, 0, 0, 0],
+            'pair_match': [0, 0],
+            'thresholds': [
+                mismatches or int(ceil(len(fields[1]) * threshold)),
+                mismatches or int(ceil(len(fields[2]) * threshold))],
+            'reg_exp': re_compile(pattern),
+            'new': defaultdict(lambda: [0, 0]),
+            'known': defaultdict(lambda: [0, 0])}
+    return library
+def open_files(path, markers):
+    """Create the output folder with one subfolder per marker and open all
+    report files in them for writing.
+    The result maps every key of `file_names` to a handle in the output
+    folder, and every marker name to a dictionary that maps the keys of
+    `marker_file_names` to handles in that marker's subfolder.
+    :arg str path: Name of the output folder.
+    :arg list markers: Name of the subfolders.
+    :returns dict: Nested dictionary containing writable file handles.
+    """
+    mkdir(path)
+    files = {
+        key: open('{}/{}'.format(path, name), 'w')
+        for key, name in file_names.items()}
+    for marker in markers:
+        marker_path = '{}/{}'.format(path, marker)
+        mkdir(marker_path)
+        files[marker] = {
+            key: open('{}/{}'.format(marker_path, name), 'w')
+            for key, name in marker_file_names.items()}
+    return files
+def write_table(table, header, handle):
+    """Write a table as tab separated lines, preceded by an optional header.
+    The header is written as is (it has to carry its own newline); every row
+    is written as its cells converted to text, joined by tabs, plus a newline.
+    :arg list table: Table content.
+    :arg str header: Table header.
+    :arg stream handle: Open writable handle to the output file.
+    """
+    if header:
+        handle.write(header)
+    if not table:
+        return
+    for row in table:
+        handle.write('\t'.join(str(cell) for cell in row) + '\n')
+def rewrite(regular_expression, pattern):
+    """Make a pattern that matches a regular expression more human readable.
+    The result is a concatenation of `unit(count)` parts, one per stretch of
+    repeated units, e.g. `AGAT(3)ACAT(1)`.
+    :arg object regular_expression: A compiled regular expression object.
+    :arg str pattern: A pattern that matches {regular_expression}.
+    :returns str: A human readable version of {pattern}.
+    """
+    match = regular_expression.match(pattern)
+    # Sentinel entry: makes the first stretch start at position 0.
+    stretches = [((0, 0), None)]
+    for index in range(1, len(match.regs)):
+        entry = (match.regs[index], match.group(index))
+        if entry == ((-1, -1), None):
+            # This group did not take part in the match.
+            continue
+        if stretches[-1][1] == entry[1]:
+            # Same unit as the previous group: extend that stretch.
+            stretches[-1] = entry
+        else:
+            stretches.append(entry)
+    parts = []
+    for previous, current in zip(stretches, stretches[1:]):
+        (start, end), unit = current
+        # A group only records its last repeat, so the number of repeats is
+        # the distance to the end of the previous stretch in unit lengths.
+        parts.append('{}({})'.format(
+            unit, (end - previous[0][1]) // (end - start)))
+    return ''.join(parts)
+def allele_table(new_allele, minimum):
+    """Make an allele statistics table.
+    Rows have the form [allele, total, forward, reverse] and are ordered by
+    descending total; alleles with a total below `minimum` are left out.
+    :arg dict new_allele: Dictionary with count data of new alleles.
+    :arg int minimum: Minimum count per allele.
+    :returns list: Allele statistics table.
+    """
+    ranked = sorted(
+        new_allele, key=lambda name: sum(new_allele[name]), reverse=True)
+    rows = []
+    for name in ranked:
+        counts = new_allele[name]
+        total = sum(counts)
+        if total < minimum:
+            # Sorted by total, so every remaining allele is too rare as well.
+            break
+        rows.append([name, total] + counts)
+    return rows
+def summary_table(allele, minimum):
+    """Filter one of the global allele tables.
+    Keeps the rows whose total count (column 3) is at least `minimum`. A real
+    list is returned, as documented, so the result can be read repeatedly; a
+    bare `filter` object would be exhausted after one pass.
+    :arg list allele: List with count data of alleles.
+    :arg int minimum: Minimum count per allele.
+    :returns list: Allele statistics table.
+    """
+    return [row for row in allele if row[3] >= minimum]
+def make_tables(total, unrecognised, library, minimum):
+    """Make overview tables of the results.
+    All tables are real lists. They are read more than once (the summary
+    sums the `nostart` and `noend` tables, and both the file writer and the
+    report use the `library` table), which lazy `map` objects do not survive.
+    :arg int total: Total number of reads in the FASTA file.
+    :arg int unrecognised: Number of unrecognised reads in.
+    :arg dict library: Nested dictionary containing library data.
+    :arg int minimum: Minimum count per allele.
+    :returns dict: A nested dictionary containing overview tables.
+    """
+    known = []
+    new = []
+    no_start = []
+    no_end = []
+    tables = {
+        'library': [
+            [name] + marker['pair_match'] + marker['counts']
+            for name, marker in library.items()],
+        'allele': defaultdict(dict)}
+    for name, marker in library.items():
+        pairs = marker['pair_match']
+        counts = marker['counts']
+        for allele, fr in marker['known'].items():
+            known.append([name] + fr + [sum(fr), allele])
+        for allele, fr in marker['new'].items():
+            new.append([name] + fr + [sum(fr), allele])
+        # Reads with only a right flank hit: right flank hits minus pairs.
+        no_start.append([name, counts[2] - pairs[0], counts[3] - pairs[1]])
+        # Reads with only a left flank hit: left flank hits minus pairs.
+        no_end.append([name, counts[0] - pairs[0], counts[1] - pairs[1]])
+        tables['allele'][name]['known'] = allele_table(
+            marker['known'], minimum)
+        tables['allele'][name]['new'] = allele_table(marker['new'], minimum)
+    tables['known'] = sorted(
+        summary_table(known, minimum), key=lambda x: (x[0], x[4]))
+    tables['new'] = sorted(
+        summary_table(new, minimum), key=lambda x: (x[0], x[3]), reverse=True)
+    tables['nostart'] = [row + [sum(row[1:])] for row in sorted(no_start)]
+    tables['noend'] = [row + [sum(row[1:])] for row in sorted(no_end)]
+    tables['summary'] = [
+        ['total reads', total],
+        ['matched pairs', sum(
+            sum(marker['pair_match']) for marker in library.values())],
+        ['new alleles', sum(row[3] for row in tables['new'])],
+        # The per marker tables were already filtered on `minimum` above.
+        ['new unique alleles', sum(
+            len(tables['allele'][name]['new']) for name in library)],
+        ['no start', sum(row[3] for row in tables['nostart'])],
+        ['no end', sum(row[3] for row in tables['noend'])],
+        ['unrecognised reads', unrecognised]]
+    return tables
+def make_text_report(tables, handle):
+    """Write a plain text overview of the results.
+    The report holds the summary table, the per marker statistics and, for
+    every marker, its known and new alleles; the heading of the new alleles
+    mentions their mean length weighted by read count (0 if there are none).
+    :arg dict tables: A nested dictionary containing overview tables.
+    :arg stream handle: Open writable handle to the report file.
+    """
+    write_table(tables['summary'], '', handle)
+    handle.write('\n')
+    write_table(tables['library'], headers['markers'], handle)
+    for marker, alleles in tables['allele'].items():
+        handle.write('\nknown alleles for marker {}:\n'.format(marker))
+        write_table(alleles['known'], headers['allele'], handle)
+        new_alleles = alleles['new']
+        # Column 0 is the allele sequence, column 1 its total read count.
+        sum_of_lengths = sum(len(row[0]) * row[1] for row in new_alleles)
+        number_of_alleles = sum(row[1] for row in new_alleles)
+        if number_of_alleles:
+            mean_length = sum_of_lengths / number_of_alleles
+        else:
+            mean_length = 0
+        handle.write('\nnew alleles for marker {} (mean length {}):\n'.format(
+            marker, mean_length))
+        write_table(new_alleles, headers['allele'], handle)
+def make_json_report(tables, handle):
+    """Make an overview of the results per marker, for downstream parsing.
+    The JSON document has a `marker` section (per marker its known and new
+    alleles and its library statistics) and a `summary` section.
+    :arg dict tables: A nested dictionary containing overview tables.
+    :arg stream handle: Open writable handle to the json file.
+    """
+    # Allele data: one record per table row, keyed by the column titles.
+    allele_fields = headers['allele'].strip().split('\t')
+    markers = {}
+    for marker, data in tables['allele'].items():
+        known = [dict(zip(allele_fields, row)) for row in data['known']]
+        new = [dict(zip(allele_fields, row)) for row in data['new']]
+        markers[marker] = {'allele': {'known': known, 'new': new}}
+    # Summary data: rows are (field, value) pairs.
+    report = {'marker': markers, 'summary': dict(tables['summary'])}
+    # Library data: the `name` column selects the marker, the rest is stored.
+    library_fields = headers['markers'].strip().split('\t')
+    for entry in tables['library']:
+        row = dict(zip(library_fields, entry))
+        name = row.pop('name')
+        markers[name]['library'] = row
+    dump(report, indent=True, fp=handle)
+def write_files(tables, files):
+    """Write the overview tables to the appropriate files.
+    The global tables go to the handles in the output folder, the per marker
+    allele tables to the handles in the marker subfolders.
+    :arg dict tables: A nested dictionary containing overview tables.
+    :arg dict files: Nested dictionary containing writable file handles.
+    """
+    write_table(tables['summary'], '', files['summary'])
+    write_table(tables['library'], headers['markers'], files['markers'])
+    for key in ('known', 'new'):
+        write_table(tables[key], headers['overview'], files[key])
+    for key in ('nostart', 'noend'):
+        write_table(tables[key], headers['nostartend'], files[key])
+    for marker, alleles in tables['allele'].items():
+        handles = files[marker]
+        write_table(
+            alleles['known'], headers['allele'], handles['knownalleles'])
+        write_table(alleles['new'], headers['allele'], handles['newalleles'])
+def _close_files(files):
+    """Close every handle in a nested dictionary as made by open_files."""
+    for entry in files.values():
+        if isinstance(entry, dict):
+            for handle in entry.values():
+                handle.close()
+        else:
+            entry.close()
+def _classify_reads(input_handle, library, files, indel_score, file_format):
+    """Classify every read against every marker, updating the counters in
+    `library` in place and, if requested, copying the reads to the files.
+    :arg stream input_handle: Open readable handle to the reads file.
+    :arg dict library: Nested dictionary containing library data.
+    :arg dict files: Nested dictionary containing writable file handles, or
+        None if the reads should not be written to files.
+    :arg int indel_score: Penalty score for insertions and deletions per
+        nucleotide
+    :arg str file_format: File format of input_handle.
+    :returns tuple: Total number of reads and number of unrecognised reads.
+    """
+    total = 0
+    unrecognised = 0
+    for record in SeqIO.parse(input_handle, file_format):
+        # Index 0 is the read as given, index 1 its reverse complement.
+        ref = [str(record.seq), Seq.reverse_complement(str(record.seq))]
+        ref_up = list(map(str.upper, ref))
+        total += 1
+        unknown = True
+        for i in library:
+            # Align against all-uppercase reference sequence.
+            alignments = (
+                align_pair(
+                    ref_up[0], ref_up[1], library[i]['flanks'], indel_score),
+                align_pair(
+                    ref_up[1], ref_up[0], library[i]['flanks'], indel_score))
+            # Left/right flank found in the forward read, then the same for
+            # the reverse complement.
+            matches = [False, False, False, False]
+            classification = ''
+            # A hit only counts if the aligned part of the read, in its
+            # original case, is not entirely lowercase.
+            if alignments[0][0][0] <= library[i]['thresholds'][0]:
+                cutout = ref[0][
+                    max(0, alignments[0][0][1]-len(library[i]['flanks'][0])):
+                    alignments[0][0][1]]
+                if cutout.lower() != cutout:
+                    library[i]['counts'][0] += 1
+                    classification = 'noend'
+                    matches[0] = True
+            if alignments[0][1][0] <= library[i]['thresholds'][1]:
+                cutout = ref[0][
+                    alignments[0][1][1]:
+                    alignments[0][1][1]+len(library[i]['flanks'][1])]
+                if cutout.lower() != cutout:
+                    library[i]['counts'][2] += 1
+                    classification = 'nostart'
+                    matches[1] = True
+            if alignments[1][0][0] <= library[i]['thresholds'][0]:
+                cutout = ref[1][
+                    max(0, alignments[1][0][1]-len(library[i]['flanks'][0])):
+                    alignments[1][0][1]]
+                if cutout.lower() != cutout:
+                    library[i]['counts'][1] += 1
+                    classification = 'noend'
+                    matches[2] = True
+            if alignments[1][1][0] <= library[i]['thresholds'][1]:
+                cutout = ref[1][
+                    alignments[1][1][1]:
+                    alignments[1][1][1]+len(library[i]['flanks'][1])]
+                if cutout.lower() != cutout:
+                    library[i]['counts'][3] += 1
+                    classification = 'nostart'
+                    matches[3] = True
+            if (matches[0] and matches[1]) or (matches[2] and matches[3]):
+                # Both flanks found on one strand: the allele lies in between.
+                hit = int(matches[2] and matches[3])
+                library[i]['pair_match'][hit] += 1
+                pat = ref_up[hit][alignments[hit][0][1]:alignments[hit][1][1]]
+                classification = 'new'
+                if library[i]['reg_exp'].match(pat):
+                    classification = 'known'
+                library[i][classification][pat][hit] += 1
+            if classification:
+                unknown = False
+                if files is not None:
+                    SeqIO.write(
+                        [record], files[i][classification], file_format)
+        if unknown:
+            unrecognised += 1
+            if files is not None:
+                SeqIO.write([record], files['unknown'], file_format)
+    return total, unrecognised
+def tssv(
+        input_handle, library_handle, report_handle, json_report, path,
+        threshold, mismatches, minimum, indel_score, file_format):
+    """Do the short structural variation analysis.
+    If `path` is set, an output folder is created; its files are closed
+    again before this function returns, also when the analysis fails.
+    :arg stream input_handle: Open readable handle to a FASTA file.
+    :arg stream library_handle: Open readable handle to a library file.
+    :arg stream report_handle: Open writable handle to the report file.
+    :arg bool json_report: If set, write the report as JSON instead of text.
+    :arg str path: Name of the output folder.
+    :arg float threshold: Number of allowed mismatches per nucleotide.
+    :arg int mismatches: If set, overrides the dynamic threshold calculation.
+    :arg int minimum: Minimum count per allele.
+    :arg int indel_score: Penalty score for insertions and deletions per
+        nucleotide
+    :arg str file_format: File format of input_handle, either 'fasta' or
+        'fastq'.
+    """
+    library = parse_library(library_handle, threshold, mismatches)
+    files = open_files(path, library) if path else None
+    try:
+        total, unrecognised = _classify_reads(
+            input_handle, library, files, indel_score, file_format)
+        tables = make_tables(total, unrecognised, library, minimum)
+        # Make the known alleles more human readable.
+        for i in tables['allele']:
+            for j in tables['allele'][i]['known']:
+                j[0] = rewrite(library[i]['reg_exp'], j[0])
+        for i in tables['known']:
+            i[4] = rewrite(library[i[0]]['reg_exp'], i[4])
+        if files is not None:
+            write_files(tables, files)
+    finally:
+        # This function opened the output files, so it also closes them.
+        if files is not None:
+            _close_files(files)
+    if json_report:
+        make_json_report(tables, report_handle)
+    else:
+        make_text_report(tables, report_handle)