PyPI - gffkit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gffkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

gffkit/__init__.py +3 -0
gffkit/__main__.py +4 -0
gffkit/add_utr.py +571 -0
gffkit/complement_annotations.py +1001 -0
gffkit/detect_bridge_merged_genes.py +530 -0
gffkit/main.py +178 -0
gffkit-0.1.0.dist-info/LICENSE +21 -0
gffkit-0.1.0.dist-info/METADATA +96 -0
gffkit-0.1.0.dist-info/RECORD +12 -0
gffkit-0.1.0.dist-info/WHEEL +5 -0
gffkit-0.1.0.dist-info/entry_points.txt +5 -0
gffkit-0.1.0.dist-info/top_level.txt +1 -0

gffkit/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""gffkit: region-aware GFF annotation integration utilities."""
+# Package version string.
+__version__ = "0.1.0"

gffkit/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# Package entry module: delegates to main() from the sibling main module.
+from .main import main
+if __name__ == "__main__":
+    # The value returned by main() becomes the process exit status.
+    raise SystemExit(main())

gffkit/add_utr.py ADDED Viewed

@@ -0,0 +1,571 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+add_utr.py
+Implements only the UTR-padding behavior, similar to AGAT's _check_utrs():
+For each transcript:
+    exon - CDS => candidate UTR intervals
+Then classify:
+    + strand:
+        interval before leftmost CDS  => five_prime_UTR
+        interval after rightmost CDS  => three_prime_UTR
+    - strand:
+        interval before leftmost CDS  => three_prime_UTR
+        interval after rightmost CDS  => five_prime_UTR
+Intervals located between CDS blocks are skipped, following AGAT's behavior
+of not creating UTR inside CDS/ribosomal-slippage-like middle regions.
+Input:
+    GFF3 or simple GTF-like file with exon/CDS features.
+Requirements:
+    - exon and CDS must have Parent=transcript_id in GFF3
+      or transcript_id "xxx" in GTF.
+    - Existing UTR features can either be kept or removed.
+Usage:
+    python add_utr.py -i input.gff3 -o output.gff3
+    python add_utr.py -i input.gtf  -o output.gff3
+    python add_utr.py -i input.gff3 -o output.gff3 --replace-existing-utrs
+"""
+import argparse
+import gzip
+import sys
+from dataclasses import dataclass, field
+from collections import defaultdict
+from typing import Dict, List, Tuple, Optional
+@dataclass
+class Feature:
+    seqid: str
+    source: str
+    ftype: str
+    start: int
+    end: int
+    score: str
+    strand: str
+    phase: str
+    attrs: Dict[str, List[str]] = field(default_factory=dict)
+    raw_attr: str = "."
+    line_no: int = 0
+    def parent_ids(self) -> List[str]:
+        """
+        Prefer GFF3 Parent.
+        Fall back to GTF transcript_id.
+        """
+        if "Parent" in self.attrs:
+            return self.attrs["Parent"]
+        if "transcript_id" in self.attrs:
+            return self.attrs["transcript_id"]
+        return []
+    def get_id(self) -> Optional[str]:
+        values = self.attrs.get("ID")
+        if values:
+            return values[0]
+        return None
+    def to_gff3(self) -> str:
+        return "\t".join([
+            self.seqid,
+            self.source,
+            self.ftype,
+            str(self.start),
+            str(self.end),
+            self.score,
+            self.strand,
+            self.phase,
+            format_gff3_attrs(self.attrs),
+        ])
+def open_text(path: str):
+    if path == "-":
+        return sys.stdin
+    if path.endswith(".gz"):
+        return gzip.open(path, "rt")
+    return open(path, "r", encoding="utf-8")
+def parse_attrs(attr_text: str) -> Dict[str, List[str]]:
+    """
+    Parse both:
+        GFF3: ID=xxx;Parent=yyy
+        GTF : gene_id "g1"; transcript_id "t1";
+    """
+    attr_text = attr_text.strip()
+    attrs: Dict[str, List[str]] = {}
+    if not attr_text or attr_text == ".":
+        return attrs
+    parts = [x.strip() for x in attr_text.rstrip(";").split(";") if x.strip()]
+    for part in parts:
+        if "=" in part:
+            key, value = part.split("=", 1)
+            key = key.strip()
+            values = [v.strip() for v in value.split(",") if v.strip()]
+            attrs[key] = values
+        else:
+            fields = part.split(None, 1)
+            if len(fields) == 2:
+                key = fields[0].strip()
+                value = fields[1].strip().strip('"')
+                attrs[key] = [value]
+    return attrs
+def format_gff3_attrs(attrs: Dict[str, List[str]]) -> str:
+    if not attrs:
+        return "."
+    preferred = ["ID", "Parent", "Name", "gene_id", "transcript_id"]
+    keys = [k for k in preferred if k in attrs]
+    keys.extend(sorted(k for k in attrs if k not in keys))
+    out = []
+    for key in keys:
+        values = attrs.get(key, [])
+        if not values:
+            continue
+        out.append(f"{key}={','.join(values)}")
+    return ";".join(out) if out else "."
+def read_gff(path: str) -> Tuple[List[str], List[Feature]]:
+    headers: List[str] = []
+    features: List[Feature] = []
+    with open_text(path) as fh:
+        for i, line in enumerate(fh, start=1):
+            line = line.rstrip("\n")
+            if not line:
+                continue
+            if line.startswith("#"):
+                headers.append(line)
+                continue
+            cols = line.split("\t")
+            if len(cols) != 9:
+                print(
+                    f"[WARN] Skip line {i}: expected 9 columns, got {len(cols)}",
+                    file=sys.stderr,
+                )
+                continue
+            seqid, source, ftype, start, end, score, strand, phase, attr_text = cols
+            try:
+                start_i = int(start)
+                end_i = int(end)
+            except ValueError:
+                print(
+                    f"[WARN] Skip line {i}: start/end is not integer",
+                    file=sys.stderr,
+                )
+                continue
+            features.append(
+                Feature(
+                    seqid=seqid,
+                    source=source,
+                    ftype=ftype,
+                    start=start_i,
+                    end=end_i,
+                    score=score,
+                    strand=strand,
+                    phase=phase,
+                    attrs=parse_attrs(attr_text),
+                    raw_attr=attr_text,
+                    line_no=i,
+                )
+            )
+    return headers, features
+def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
+    """
+    Merge overlapping or adjacent intervals.
+    Coordinates are 1-based closed intervals.
+    """
+    if not intervals:
+        return []
+    intervals = sorted(intervals)
+    merged = [intervals[0]]
+    for s, e in intervals[1:]:
+        last_s, last_e = merged[-1]
+        if s <= last_e + 1:
+            merged[-1] = (last_s, max(last_e, e))
+        else:
+            merged.append((s, e))
+    return merged
+def subtract_intervals(
+    exon: Tuple[int, int],
+    cds_intervals: List[Tuple[int, int]],
+) -> List[Tuple[int, int]]:
+    """
+    Return exon - CDS.
+    Example:
+        exon = 100-500
+        CDS  = 200-400
+        result:
+            100-199
+            401-500
+    """
+    remaining = [exon]
+    for cds_s, cds_e in cds_intervals:
+        new_remaining = []
+        for s, e in remaining:
+            if cds_e < s or cds_s > e:
+                new_remaining.append((s, e))
+                continue
+            if s < cds_s:
+                new_remaining.append((s, cds_s - 1))
+            if cds_e < e:
+                new_remaining.append((cds_e + 1, e))
+        remaining = new_remaining
+    return remaining
+def is_utr_type(ftype: str) -> bool:
+    return ftype.lower() in {
+        "utr",
+        "five_prime_utr",
+        "three_prime_utr",
+        "5utr",
+        "3utr",
+        "five_prime_UTR".lower(),
+        "three_prime_UTR".lower(),
+    }
+def build_used_ids(features: List[Feature]) -> set:
+    used = set()
+    for f in features:
+        fid = f.get_id()
+        if fid:
+            used.add(fid)
+    return used
+def unique_id(base: str, used_ids: set) -> str:
+    if base not in used_ids:
+        used_ids.add(base)
+        return base
+    n = 2
+    while f"{base}_{n}" in used_ids:
+        n += 1
+    new_id = f"{base}_{n}"
+    used_ids.add(new_id)
+    return new_id
+def classify_utr(
+    utr_start: int,
+    utr_end: int,
+    strand: str,
+    leftmost_cds: int,
+    rightmost_cds: int,
+) -> Optional[str]:
+    """
+    Match AGAT's classification idea:
+    If UTR is before the leftmost CDS:
+        + => five_prime_UTR
+        - => three_prime_UTR
+    If UTR is after the rightmost CDS:
+        + => three_prime_UTR
+        - => five_prime_UTR
+    If UTR is between CDS blocks, skip it.
+    """
+    if utr_end < leftmost_cds:
+        if strand == "-":
+            return "three_prime_UTR"
+        return "five_prime_UTR"
+    if utr_start > rightmost_cds:
+        if strand == "-":
+            return "five_prime_UTR"
+        return "three_prime_UTR"
+    return None
+def make_utr_feature(
+    template_exon: Feature,
+    parent_id: str,
+    utr_start: int,
+    utr_end: int,
+    utr_type: str,
+    used_ids: set,
+    index: int,
+    id_prefix: str,
+) -> Feature:
+    """
+    Clone exon-like information and replace type/start/end/phase/ID/Parent.
+    """
+    utr_id = unique_id(f"{id_prefix}{parent_id}.{utr_type}.{index}", used_ids)
+    attrs = {
+        "ID": [utr_id],
+        "Parent": [parent_id],
+    }
+    # Keep useful GTF-origin attributes if present.
+    if "gene_id" in template_exon.attrs:
+        attrs["gene_id"] = list(template_exon.attrs["gene_id"])
+    if "transcript_id" in template_exon.attrs:
+        attrs["transcript_id"] = list(template_exon.attrs["transcript_id"])
+    return Feature(
+        seqid=template_exon.seqid,
+        source=template_exon.source,
+        ftype=utr_type,
+        start=utr_start,
+        end=utr_end,
+        score=template_exon.score,
+        strand=template_exon.strand,
+        phase=".",
+        attrs=attrs,
+        line_no=template_exon.line_no,
+    )
+def add_utrs_like_agat(
+    features: List[Feature],
+    replace_existing_utrs: bool = False,
+    id_prefix: str = "agat_utrs_",
+) -> List[Feature]:
+    """
+    Add missing UTRs from exon and CDS.
+    If replace_existing_utrs=True:
+        remove all existing UTRs first, then recreate expected UTRs.
+    If replace_existing_utrs=False:
+        keep existing UTRs and only add expected UTRs that do not already
+        have identical coordinates and type under the same transcript.
+    """
+    exons_by_tx = defaultdict(list)
+    cds_by_tx = defaultdict(list)
+    utrs_by_tx = defaultdict(list)
+    for f in features:
+        ftype_lower = f.ftype.lower()
+        for parent in f.parent_ids():
+            if ftype_lower == "exon":
+                exons_by_tx[parent].append(f)
+            elif ftype_lower == "cds":
+                cds_by_tx[parent].append(f)
+            elif is_utr_type(f.ftype):
+                utrs_by_tx[parent].append(f)
+    used_ids = build_used_ids(features)
+    new_utrs: List[Feature] = []
+    # Optionally remove existing UTRs, closer to AGAT's "recreate when wrong" behavior.
+    if replace_existing_utrs:
+        features = [f for f in features if not is_utr_type(f.ftype)]
+    for tx_id in sorted(set(exons_by_tx) & set(cds_by_tx)):
+        exons = sorted(exons_by_tx[tx_id], key=lambda x: (x.start, x.end))
+        cds_features = sorted(cds_by_tx[tx_id], key=lambda x: (x.start, x.end))
+        if not exons or not cds_features:
+            continue
+        cds_intervals = merge_intervals([(c.start, c.end) for c in cds_features])
+        leftmost_cds = min(s for s, _ in cds_intervals)
+        rightmost_cds = max(e for _, e in cds_intervals)
+        existing_signatures = set()
+        if not replace_existing_utrs:
+            for u in utrs_by_tx.get(tx_id, []):
+                existing_signatures.add((u.ftype, u.start, u.end))
+        utr_index = 0
+        for exon in exons:
+            candidate_intervals = subtract_intervals(
+                (exon.start, exon.end),
+                cds_intervals,
+            )
+            for utr_start, utr_end in candidate_intervals:
+                if utr_start > utr_end:
+                    continue
+                utr_type = classify_utr(
+                    utr_start=utr_start,
+                    utr_end=utr_end,
+                    strand=exon.strand,
+                    leftmost_cds=leftmost_cds,
+                    rightmost_cds=rightmost_cds,
+                )
+                # AGAT skips UTR candidates located between CDS blocks.
+                if utr_type is None:
+                    continue
+                signature = (utr_type, utr_start, utr_end)
+                if signature in existing_signatures:
+                    continue
+                utr_index += 1
+                utr = make_utr_feature(
+                    template_exon=exon,
+                    parent_id=tx_id,
+                    utr_start=utr_start,
+                    utr_end=utr_end,
+                    utr_type=utr_type,
+                    used_ids=used_ids,
+                    index=utr_index,
+                    id_prefix=id_prefix,
+                )
+                new_utrs.append(utr)
+                existing_signatures.add(signature)
+    return insert_utrs_after_matching_exons(features, new_utrs)
+def insert_utrs_after_matching_exons(
+    features: List[Feature],
+    new_utrs: List[Feature],
+) -> List[Feature]:
+    """
+    Keep original order as much as possible.
+    Insert newly created UTRs after the exon line that was used as template.
+    """
+    by_line = defaultdict(list)
+    for u in new_utrs:
+        by_line[u.line_no].append(u)
+    output = []
+    for f in features:
+        output.append(f)
+        if f.line_no in by_line:
+            output.extend(
+                sorted(
+                    by_line[f.line_no],
+                    key=lambda x: (x.start, x.end, x.ftype),
+                )
+            )
+    return output
+def write_gff3(headers: List[str], features: List[Feature], output: str) -> None:
+    if output == "-":
+        out = sys.stdout
+        close = False
+    else:
+        out = open(output, "w", encoding="utf-8")
+        close = True
+    try:
+        has_gff_version = any(h.startswith("##gff-version") for h in headers)
+        if not has_gff_version:
+            print("##gff-version 3", file=out)
+        for h in headers:
+            if h.startswith("##FASTA"):
+                # This simple script does not preserve FASTA section safely.
+                # Stop before FASTA.
+                break
+            print(h, file=out)
+        for f in features:
+            print(f.to_gff3(), file=out)
+    finally:
+        if close:
+            out.close()
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Add missing UTR features from exon and CDS, similar to AGAT _check_utrs()."
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        help="Input GFF3/GTF file. Use '-' for stdin. .gz is supported.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        help="Output GFF3 file. Use '-' for stdout.",
+    )
+    parser.add_argument(
+        "--replace-existing-utrs",
+        action="store_true",
+        help=(
+            "Remove existing UTR/five_prime_UTR/three_prime_UTR features "
+            "and recreate UTRs from exon/CDS. This is closer to AGAT's correction mode."
+        ),
+    )
+    parser.add_argument(
+        "--id-prefix",
+        default="agat_utrs_",
+        help="Prefix used for newly created UTR IDs.",
+    )
+    return parser
+def main():
+    args = build_parser().parse_args()
+    headers, features = read_gff(args.input)
+    new_features = add_utrs_like_agat(
+        features,
+        replace_existing_utrs=args.replace_existing_utrs,
+        id_prefix=args.id_prefix,
+    )
+    write_gff3(headers, new_features, args.output)
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()