HTSeq-2.1.2-cp313-cp313-macosx_10_15_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- HTSeq/StepVector.py +629 -0
- HTSeq/StretchVector.py +491 -0
- HTSeq/_HTSeq.cpython-313-darwin.so +0 -0
- HTSeq/_HTSeq_internal.py +85 -0
- HTSeq/_StepVector.cpython-313-darwin.so +0 -0
- HTSeq/__init__.py +1249 -0
- HTSeq/features.py +489 -0
- HTSeq/scripts/__init__.py +0 -0
- HTSeq/scripts/count.py +528 -0
- HTSeq/scripts/count_features/__init__.py +0 -0
- HTSeq/scripts/count_features/count_features_per_file.py +465 -0
- HTSeq/scripts/count_features/reads_io_processor.py +187 -0
- HTSeq/scripts/count_features/reads_stats.py +92 -0
- HTSeq/scripts/count_with_barcodes.py +746 -0
- HTSeq/scripts/qa.py +336 -0
- HTSeq/scripts/utils.py +372 -0
- HTSeq/utils.py +92 -0
- htseq-2.1.2.dist-info/METADATA +813 -0
- htseq-2.1.2.dist-info/RECORD +23 -0
- htseq-2.1.2.dist-info/WHEEL +5 -0
- htseq-2.1.2.dist-info/entry_points.txt +4 -0
- htseq-2.1.2.dist-info/licenses/LICENSE +674 -0
- htseq-2.1.2.dist-info/top_level.txt +1 -0
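The wheel bundles three command-line scripts (count.py, qa.py, count_with_barcodes.py) and registers them via entry_points.txt. A minimal sketch of listing those console scripts after installing the wheel; it reads the entry-point names from the installed metadata rather than assuming them, since the diff summary above does not show the file's contents:

# Minimal sketch: list the console scripts registered by this wheel's
# entry_points.txt (names are read from metadata, not hard-coded).
from importlib.metadata import distribution

dist = distribution("HTSeq")
for ep in dist.entry_points:
    if ep.group == "console_scripts":
        print(f"{ep.name} -> {ep.value}")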
HTSeq/scripts/qa.py
ADDED
#!/usr/bin/env python

# HTSeq_QA.py
#
# (c) Simon Anders, European Molecular Biology Laboratory, 2010
# released under GNU General Public License

import sys
import os.path
import argparse
from itertools import islice
import numpy as np
import HTSeq

try:
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import Normalize
except ImportError:
    sys.stderr.write("htseq-qa needs 'matplotlib >= 1.5'\n")
    raise


def get_read_length(readfile, isAlnmntFile):
    # Scan at most the first 10,000 reads for the longest one
    readlen = 0
    if isAlnmntFile:
        reads = (a.read for a in readfile)
    else:
        reads = readfile
    for r in islice(reads, 10000):
        if len(r) > readlen:
            readlen = len(r)

    return readlen


def compute_quality(
        readfilename,
        file_type,
        nosplit,
        readlen,
        max_qual,
        gamma,
        primary_only=False,
        max_records=-1,
        ):

    if file_type in ("sam", "bam"):
        readfile = HTSeq.BAM_Reader(readfilename)
        isAlnmntFile = True
    elif file_type == "solexa-export":
        readfile = HTSeq.SolexaExportReader(readfilename)
        isAlnmntFile = True
    elif file_type == "fastq":
        readfile = HTSeq.FastqReader(readfilename)
        isAlnmntFile = False
    elif file_type == "solexa-fastq":
        readfile = HTSeq.FastqReader(readfilename, "solexa")
        isAlnmntFile = False
    else:
        raise ValueError('File format not recognized: {:}'.format(file_type))

    twoColumns = isAlnmntFile and (not nosplit)

    if readlen is None:
        readlen = get_read_length(readfile, isAlnmntFile)

    # Initialize count arrays
    base_arr_U = np.zeros((readlen, 5), np.int64)
    qual_arr_U = np.zeros((readlen, max_qual+1), np.int64)
    if twoColumns:
        base_arr_A = np.zeros((readlen, 5), np.int64)
        qual_arr_A = np.zeros((readlen, max_qual+1), np.int64)

    # Main counting loop
    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a

            # Exclude non-primary alignments if requested
            if isAlnmntFile and primary_only:
                if a.aligned and a.not_primary_alignment:
                    continue

            if twoColumns and isAlnmntFile and a.aligned:
                r.add_bases_to_count_array(base_arr_A)
                r.add_qual_to_count_array(qual_arr_A)
            else:
                r.add_bases_to_count_array(base_arr_U)
                r.add_qual_to_count_array(qual_arr_U)

            i += 1

            if i == max_records:
                break

            if (i % 200000) == 0:
                if (not isAlnmntFile) or primary_only:
                    print(i, "reads processed")
                else:
                    print(i, "alignments processed")

    except:
        sys.stderr.write("Error occurred in: %s\n" %
                         readfile.get_line_number_string())
        raise

    if (not isAlnmntFile) or primary_only:
        print(i, "reads processed")
    else:
        print(i, "alignments processed")

    # Normalize result
    def norm_by_pos(arr):
        # Convert each position's base counts into proportions
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)).T
        arr_n[arr == 0] = 0
        return arr_n

    def norm_by_start(arr):
        # Normalize by the total count at the first position (= number of reads)
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)[0]).T
        arr_n[arr == 0] = 0
        return arr_n

    result = {
        'isAlnmntFile': isAlnmntFile,
        'readlen': readlen,
        'twoColumns': twoColumns,
        'base_arr_U_n': norm_by_pos(base_arr_U),
        'qual_arr_U_n': norm_by_start(qual_arr_U),
        'nreads_U': base_arr_U[0, :].sum(),
    }

    if twoColumns:
        result['base_arr_A_n'] = norm_by_pos(base_arr_A)
        result['qual_arr_A_n'] = norm_by_start(qual_arr_A)
        result['nreads_A'] = base_arr_A[0, :].sum()

    return result


def plot(
        result,
        readfilename,
        outfile,
        max_qual,
        gamma,
        primary_only=False,
        ):

    def plot_bases(arr, ax):
        xg = np.arange(readlen)
        ax.plot(xg, arr[:, 0], marker='.', color='red')
        ax.plot(xg, arr[:, 1], marker='.', color='darkgreen')
        ax.plot(xg, arr[:, 2], marker='.', color='lightgreen')
        ax.plot(xg, arr[:, 3], marker='.', color='orange')
        ax.plot(xg, arr[:, 4], marker='.', color='grey')
        ax.set_xlim(0, readlen-1)
        ax.set_ylim(0, 1)
        ax.text(readlen*.70, .9, "A", color="red")
        ax.text(readlen*.75, .9, "C", color="darkgreen")
        ax.text(readlen*.80, .9, "G", color="lightgreen")
        ax.text(readlen*.85, .9, "T", color="orange")
        ax.text(readlen*.90, .9, "N", color="grey")

    if outfile is None:
        outfilename = os.path.basename(readfilename) + ".pdf"
    else:
        outfilename = outfile

    isAlnmntFile = result['isAlnmntFile']
    readlen = result['readlen']
    twoColumns = result['twoColumns']

    base_arr_U_n = result['base_arr_U_n']
    qual_arr_U_n = result['qual_arr_U_n']
    nreads_U = result['nreads_U']

    if twoColumns:
        base_arr_A_n = result['base_arr_A_n']
        qual_arr_A_n = result['qual_arr_A_n']
        nreads_A = result['nreads_A']

    cur_backend = matplotlib.get_backend()

    try:
        matplotlib.use('PDF')

        fig = plt.figure()
        fig.subplots_adjust(top=.85)
        fig.suptitle(os.path.basename(readfilename), fontweight='bold')

        if twoColumns:

            ax = fig.add_subplot(221)
            plot_bases(base_arr_U_n, ax)
            ax.set_ylabel("proportion of base")
            ax.set_title(
                "non-aligned reads\n{:.0%} ({:.4f} million)".format(
                    1.0 * nreads_U / (nreads_U+nreads_A),
                    1.0 * nreads_U / 1e6,
                ))

            ax2 = fig.add_subplot(222)
            plot_bases(base_arr_A_n, ax2)
            ax2.set_title(
                "{:}\n{:.0%} ({:.4f} million)".format(
                    'aligned reads' if primary_only else 'alignments',
                    1.0 * nreads_A / (nreads_U+nreads_A),
                    1.0 * nreads_A / 1e6,
                ))

            ax3 = fig.add_subplot(223)
            ax3.pcolor(
                qual_arr_U_n.T ** gamma,
                cmap=plt.cm.Greens,
                norm=Normalize(0, 1))
            ax3.set_xlim(0, readlen-1)
            ax3.set_ylim(0, max_qual+1)
            ax3.set_xlabel("position in read")
            ax3.set_ylabel("base-call quality score")

            ax4 = fig.add_subplot(224)
            ax4.pcolor(
                qual_arr_A_n.T ** gamma,
                cmap=plt.cm.Greens,
                norm=Normalize(0, 1))
            ax4.set_xlim(0, readlen-1)
            ax4.set_ylim(0, max_qual+1)
            ax4.set_xlabel("position in read")

        else:

            ax = fig.add_subplot(211)
            plot_bases(base_arr_U_n, ax)
            ax.set_ylabel("proportion of base")
            ax.set_title("{:.3f} million {:}".format(
                1.0 * nreads_U / 1e6,
                'reads' if (not isAlnmntFile) or primary_only else 'alignments',
            ))

            ax2 = fig.add_subplot(212)
            ax2.pcolor(
                qual_arr_U_n.T ** gamma,
                cmap=plt.cm.Greens,
                norm=Normalize(0, 1))
            ax2.set_xlim(0, readlen-1)
            ax2.set_ylim(0, max_qual+1)
            ax2.set_xlabel("position in read")
            ax2.set_ylabel("base-call quality score")

        fig.savefig(outfilename)

    finally:
        matplotlib.use(cur_backend)


def main():

    # **** Parse command line ****
    pa = argparse.ArgumentParser(
        description=
        "This script takes a file with high-throughput sequencing reads " +
        "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
        "_sequence.txt) and performs a simple quality assessment by " +
        "producing plots showing the distribution of called bases and " +
        "base-call quality scores by position within the reads. The " +
        "plots are output as a PDF file.",
    )
    pa.add_argument(
        'readfilename',
        help='The file to count reads in (SAM/BAM or Fastq)',
    )
    pa.add_argument(
        "-t", "--type", type=str, dest="type",
        choices=("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
        default="sam", help="type of read_file (one of: sam [default], bam, " +
        "solexa-export, fastq, solexa-fastq)")
    pa.add_argument(
        "-o", "--outfile", type=str, dest="outfile",
        help="output filename (default is <read_file>.pdf)")
    pa.add_argument(
        "-r", "--readlength", type=int, dest="readlen",
        help="the maximum read length (when not specified, the script guesses from the file)")
    pa.add_argument(
        "-g", "--gamma", type=float, dest="gamma",
        default=0.3,
        help="the gamma factor for the contrast adjustment of the quality score plot")
    pa.add_argument(
        "-n", "--nosplit", action="store_true", dest="nosplit",
        help="do not split reads into unaligned and aligned ones")
    pa.add_argument(
        "-m", "--maxqual", type=int, dest="maxqual", default=41,
        help="the maximum quality score that appears in the data (default: 41)")
    pa.add_argument(
        '--primary-only', action='store_true',
        help="For SAM/BAM input files, ignore alignments that are not primary. " +
        "This only affects 'multimapper' reads that align to several regions " +
        "in the genome. By choosing this option, each read will only count as " +
        "one; without this option, each of its alignments counts as one."
    )
    pa.add_argument(
        '--max-records', type=int, default=-1, dest='max_records',
        help="Limit the analysis to the first N reads/alignments."
    )

    args = pa.parse_args()

    result = compute_quality(
        args.readfilename,
        args.type,
        args.nosplit,
        args.readlen,
        args.maxqual,
        args.gamma,
        args.primary_only,
        args.max_records,
    )

    plot(
        result,
        args.readfilename,
        args.outfile,
        args.maxqual,
        args.gamma,
        args.primary_only,
    )


if __name__ == "__main__":
    main()
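htseq-qa's pipeline is two calls: compute_quality builds normalized per-position base and quality count arrays, and plot renders them to a PDF (the quality heat map is drawn as qual_arr ** gamma, so the default gamma of 0.3 boosts contrast for rare quality values). A minimal sketch of driving it from Python instead of the console script; 'example.bam' and the output name are placeholders, and the numeric arguments mirror the argparse defaults above:

# Minimal sketch: run the QA steps directly (placeholder file names).
from HTSeq.scripts.qa import compute_quality, plot

result = compute_quality(
    "example.bam",   # readfilename (placeholder)
    "bam",           # file_type
    False,           # nosplit: keep the aligned/unaligned split
    None,            # readlen: guess from the first 10,000 reads
    41,              # max_qual, as in -m/--maxqual
    0.3,             # gamma, as in -g/--gamma
)
plot(result, "example.bam", "example.bam.pdf", 41, 0.3)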
HTSeq/scripts/utils.py
ADDED
import sys
import numpy as np


class UnknownChrom(Exception):
    pass


def my_showwarning(message, category, filename, lineno=None, file=None,
                   line=None):
    sys.stderr.write("Warning: %s\n" % message)


def invert_strand(iv):
    iv2 = iv.copy()
    if iv2.strand == "+":
        iv2.strand = "-"
    elif iv2.strand == "-":
        iv2.strand = "+"
    else:
        raise ValueError("Illegal strand")
    return iv2


def _merge_counts(
        results,
        attributes,
        additional_attributes,
        sparse=False,
        dtype=np.float32,
        ):
    # results is a dict with 'cell_barcodes' and 'counts' in barcode mode,
    # otherwise a list of per-sample dicts
    barcodes = 'cell_barcodes' in results

    if barcodes:
        cbs = results['cell_barcodes']
        counts = results['counts']

    feature_attr = sorted(attributes.keys())
    other_features = [
        ('__no_feature', 'empty'),
        ('__ambiguous', 'ambiguous'),
        ('__too_low_aQual', 'lowqual'),
        ('__not_aligned', 'notaligned'),
        ('__alignment_not_unique', 'nonunique'),
    ]

    fea_names = [fea for fea in feature_attr] + [fea[0] for fea in other_features]
    L = len(fea_names)
    if barcodes:
        n = len(cbs)
    else:
        n = len(results)
    if not sparse:
        table = np.zeros(
            (n, L),
            dtype=dtype,
        )
    else:
        from scipy.sparse import lil_matrix
        table = lil_matrix((n, L), dtype=dtype)

    if not barcodes:
        fea_ids = [fea for fea in feature_attr] + [fea[1] for fea in other_features]
        for j, r in enumerate(results):
            for i, fn in enumerate(fea_ids):
                if i < len(feature_attr):
                    countji = r['counts'][fn]
                else:
                    countji = r[fn]
                if countji > 0:
                    table[j, i] = countji
    else:
        for j, cb in enumerate(cbs):
            for i, fn in enumerate(fea_names):
                countji = counts[cb][fn]
                if countji > 0:
                    table[j, i] = countji

    if sparse:
        table = table.tocsr()

    feature_metadata = {
        'id': fea_names,
    }
    for iadd, attr in enumerate(additional_attributes):
        feature_metadata[attr] = [attributes[fn][iadd] for fn in feature_attr]

    return {
        'feature_metadata': feature_metadata,
        'table': table,
    }


def _count_results_to_tsv(
        results,
        samples_name,
        attributes,
        additional_attributes,
        output_filename,
        output_delimiter,
        output_append=False,
        add_tsv_header=False
        ):

    barcodes = 'cell_barcodes' in results

    pad = ['' for attr in additional_attributes]

    if barcodes:
        cbs = results['cell_barcodes']
        counts = results['counts']

        # Print or write header
        fields = [''] + pad + cbs
        line = output_delimiter.join(fields)
        if output_filename == '':
            print(line)
        else:
            with open(output_filename, 'w') as f:
                f.write(line)
                f.write('\n')

    elif add_tsv_header:
        # Write the header.
        # Only get here if we don't have cell barcodes, i.e. this is not
        # called by htseq-count-barcode, and the user wants the tsv header
        file_header = output_delimiter.join([''] + pad + samples_name)

        if output_filename == '':
            print(file_header)
        else:
            # If appending to an existing file, open in append mode
            file_open_opt = 'a' if output_append else 'w'

            with open(output_filename, file_open_opt) as f:
                f.write(file_header)
                f.write('\n')

    # Each feature is a row with feature id, additional attrs, and counts
    feature_attr = sorted(attributes.keys())
    for ifn, fn in enumerate(feature_attr):
        if not barcodes:
            fields = [fn] + attributes[fn] + [str(r['counts'][fn]) for r in results]
        else:
            fields = [fn] + attributes[fn] + [str(counts[cb][fn]) for cb in cbs]

        line = output_delimiter.join(fields)
        if output_filename == '':
            print(line)
        else:
            omode = 'a' if output_append or (ifn > 0) or barcodes or add_tsv_header else 'w'
            with open(output_filename, omode) as f:
                f.write(line)
                f.write('\n')

    # Add other features (unmapped, etc.)
    other_features = [
        ('__no_feature', 'empty'),
        ('__ambiguous', 'ambiguous'),
        ('__too_low_aQual', 'lowqual'),
        ('__not_aligned', 'notaligned'),
        ('__alignment_not_unique', 'nonunique'),
    ]
    for title, fn in other_features:
        if not barcodes:
            fields = [title] + pad + [str(r[fn]) for r in results]
        else:
            fields = [title] + pad + [str(counts[cb][title]) for cb in cbs]
        line = output_delimiter.join(fields)
        if output_filename == '':
            print(line)
        else:
            with open(output_filename, 'a') as f:
                f.write(line)
                f.write('\n')


def _count_table_to_mtx(
        filename,
        table,
        feature_metadata,
        samples,
        ):
    if not str(filename).endswith('.mtx'):
        raise ValueError('Matrix Market filename should end with ".mtx"')

    try:
        from scipy.io import mmwrite
    except ImportError:
        raise ImportError('Install scipy for mtx support')

    filename_pfx = str(filename)[:-4]
    filename_feature_meta = filename_pfx+'_features.tsv'
    filename_samples = filename_pfx+'_samples.tsv'

    # Write main matrix (features as columns)
    mmwrite(
        filename,
        table,
    )

    # Write input filenames
    with open(filename_samples, 'wt') as fout:
        for fn in samples:
            fout.write(fn+'\n')

    # Write feature metadata (ids and additional attributes)
    with open(filename_feature_meta, 'wt') as fout:
        nkeys = len(feature_metadata)
        for ik, key in enumerate(feature_metadata):
            if ik != nkeys - 1:
                fout.write(key+'\t')
            else:
                fout.write(key+'\n')
        nfeatures = len(feature_metadata[key])
        for i in range(nfeatures):
            for ik, key in enumerate(feature_metadata):
                if ik != nkeys - 1:
                    fout.write(feature_metadata[key][i]+'\t')
                else:
                    fout.write(feature_metadata[key][i]+'\n')


def _count_table_to_h5ad(
        filename,
        table,
        feature_metadata,
        samples,
        ):
    try:
        import anndata
    except ImportError:
        raise ImportError('Install the anndata package for h5ad support')

    # If they have anndata, they have scipy and pandas too
    import pandas as pd

    # We don't have additional attributes (e.g. gene name) for htseq-specific
    # features like __no_feature. Hence the trick is to convert each array to
    # a Series, so the value for htseq-specific features like __no_feature is
    # set to NaN.
    # See: https://stackoverflow.com/questions/19736080/creating-dataframe-from-a-dictionary-where-entries-have-different-lengths
    feature_metadata = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in feature_metadata.items()]))
    feature_metadata.set_index(feature_metadata.columns[0], inplace=True)

    adata = anndata.AnnData(
        X=table,
        obs=pd.DataFrame([], index=samples),
        var=feature_metadata,
    )
    adata.write_h5ad(filename)


def _count_table_to_loom(
        filename,
        table,
        feature_metadata,
        samples,
        ):

    try:
        import loompy
    except ImportError:
        raise ImportError('Install the loompy package for loom support')

    # Loom uses features as rows...
    layers = {'': table.T}
    row_attrs = feature_metadata
    col_attrs = {'_index': samples}
    loompy.create(
        filename,
        layers=layers,
        row_attrs=row_attrs,
        col_attrs=col_attrs,
    )


def _write_output(
        results,
        samples,
        attributes,
        additional_attributes,
        output_filename,
        output_delimiter,
        output_append,
        sparse=False,
        dtype=np.float32,
        add_tsv_header=False
        ):

    """
    Export the gene counts as tsv/csv, mtx, loom, or h5ad files.

    Note: the parameter documentation below is still incomplete.

    Parameters
    ----------
    results : list
        List of dictionaries, with each element representing the counts for
        an input BAM file. The list is in the same order as the samples
        parameter, so the first element in the list corresponds to the first
        file in the samples parameter.
    samples : list
        List of input BAM files.

    """

    # Write output to stdout or TSV/CSV
    if output_filename == '':
        _count_results_to_tsv(
            results,
            samples,
            attributes,
            additional_attributes,
            output_filename,
            output_delimiter,
            output_append=False,
            add_tsv_header=add_tsv_header
        )
        return

    # Get file extension/format
    output_sfx = output_filename.split('.')[-1].lower()

    if output_sfx in ('csv', 'tsv'):
        _count_results_to_tsv(
            results,
            samples,
            attributes,
            additional_attributes,
            output_filename,
            output_delimiter,
            output_append,
            add_tsv_header=add_tsv_header
        )
        return

    # Make unified object of counts and feature metadata
    output_dict = _merge_counts(
        results,
        attributes,
        additional_attributes,
        sparse=sparse,
        dtype=dtype,
    )

    if output_sfx == 'mtx':
        _count_table_to_mtx(
            output_filename,
            output_dict['table'],
            output_dict['feature_metadata'],
            samples,
        )
        return

    if output_sfx == 'loom':
        _count_table_to_loom(
            output_filename,
            output_dict['table'],
            output_dict['feature_metadata'],
            samples,
        )
        return

    if output_sfx == 'h5ad':
        _count_table_to_h5ad(
            output_filename,
            output_dict['table'],
            output_dict['feature_metadata'],
            samples,
        )
        return

    raise ValueError(
        f'Format not recognized for output count file: {output_sfx}')
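_write_output dispatches on the output filename's suffix: an empty name or .tsv/.csv goes straight through _count_results_to_tsv, while .mtx, .loom and .h5ad first pass through _merge_counts to build a samples-by-features table. A minimal sketch of the plain (non-barcode) path with one hand-built sample; the gene ids and counts are made up for illustration:

# Minimal sketch: print a one-sample count table as TSV to stdout.
from HTSeq.scripts.utils import _write_output

attributes = {'gene1': ['GeneOne'], 'gene2': ['GeneTwo']}
additional_attributes = ['gene_name']
results = [{
    'counts': {'gene1': 10, 'gene2': 3},
    'empty': 5, 'ambiguous': 1, 'lowqual': 0,
    'notaligned': 2, 'nonunique': 4,
}]

# An output_filename ending in .mtx/.loom/.h5ad would dispatch to the
# corresponding writer instead of the TSV path.
_write_output(
    results, ['sample1.bam'], attributes, additional_attributes,
    output_filename='', output_delimiter='\t', output_append=False,
    add_tsv_header=True,
)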