PyPI - exbee - Versions diffs - 2026.6.4__py3-none-any.whl - Mend

exbee 2026.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

exbee/__init__.py +10 -0
exbee/exb_parser.py +261 -0
exbee/trs_parser.py +182 -0
exbee-2026.6.4.dist-info/METADATA +24 -0
exbee-2026.6.4.dist-info/RECORD +7 -0
exbee-2026.6.4.dist-info/WHEEL +4 -0
exbee-2026.6.4.dist-info/entry_points.txt +3 -0

exbee/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from exbee.exb_parser import EXB
+from exbee.trs_parser import TRS
+__version__ = "2026.6.4"
+def main() -> None:
+    """Console entry point: print every segment parsed from a TRS file.
+    The transcript path is taken from the first command-line argument
+    instead of a path that only exists on the author's machine.
+    :raises SystemExit: if no path was given on the command line.
+    """
+    import sys
+    if len(sys.argv) < 2:
+        raise SystemExit("usage: exbee FILE.trs")
+    trs = TRS(sys.argv[1])
+    for segment in trs.contents_dump:
+        print(segment)

exbee/exb_parser.py ADDED Viewed

@@ -0,0 +1,261 @@
+from pathlib import Path
+from lxml import etree  # pyright: ignore[reportAttributeAccessIssue]
+from loguru import logger
+class EXB:
+    def __init__(self, file: Path | str):
+        """Parse an EXB file and run basic sanity checks on its tiers.
+        Sets ``path``, ``doc`` (the parsed XML root), ``wavfile_raw`` (the
+        ``url`` of the first ``<referenced-file>``) and ``wavfile_abs``
+        (that url resolved against the directory of the EXB file). Both
+        wavfile attributes are None when the document references no file.
+        Non-unique tier ids or display names are logged as critical but do
+        not abort loading.
+        :param Path | str file: Path of the EXB file to read.
+        """
+        self.path = Path(file)
+        self.doc = etree.fromstring(self.path.read_bytes())
+        # A document without a referenced recording must still be loadable.
+        referenced = self.doc.find(".//referenced-file")
+        url = referenced.get("url") if referenced is not None else None
+        if url is None:
+            logger.warning(f"No referenced file found in {self.path}")
+            self.wavfile_raw = None
+            self.wavfile_abs = None
+        else:
+            self.wavfile_raw = Path(url)
+            self.wavfile_abs = (
+                self.path.absolute().resolve().parent / self.wavfile_raw
+            ).absolute()
+        # Check if trouble:
+        if not self.test_tier_id_unique():
+            logger.critical("Tiers have non-unique ids! Fix it!")
+        if not self.test_tier_display_name_unique():
+            logger.critical("Tiers have non-unique display names! Fix it!")
+    def get_tier_names(self):
+        """Return the display name of every tier, in document order.
+        Tiers without a ``display-name`` attribute are reported as
+        ``<NO DISPLAY NAME!>``.
+        """
+        names = []
+        for tier in self.doc.findall(".//tier"):
+            names.append(tier.attrib.get("display-name", "<NO DISPLAY NAME!>"))
+        return names
+    @property
+    def tier_names(self):
+        """Display names of all tiers, in document order.
+        A tier lacking the attribute yields ``<NO DISPLAY NAME!>``.
+        """
+        placeholder = "<NO DISPLAY NAME!>"
+        tiers = self.doc.findall(".//tier")
+        return [tier.attrib.get("display-name", placeholder) for tier in tiers]
+    @property
+    def timeline(self):
+        """Map the id of each ``<tli>`` element to its time as a float.
+        ``<tli>`` elements without a ``time`` attribute are left out.
+        """
+        stamps = {}
+        for tli in self.doc.findall(".//tli"):
+            if "time" not in tli.attrib:
+                continue
+            key = tli.attrib["id"]
+            stamps[key] = float(tli.attrib["time"])
+        return stamps
+    @property
+    def speakers(self):
+        """Speakers named on the tiers, deduplicated, in order of appearance.
+        The tier whose display name is ``[nn]`` is ignored. A tier without a
+        ``speaker`` attribute contributes ``None``.
+        """
+        ordered = {}
+        for tier in self.doc.findall(".//tier"):
+            if tier.attrib.get("display-name") == "[nn]":
+                continue
+            ordered.setdefault(tier.attrib.get("speaker"), None)
+        return list(ordered)
+    def round_timeline(self, decimals=3) -> None:
+        """Round all the timestamps to the desired precision, in place.
+        ``<tli>`` elements without a ``time`` attribute are left untouched,
+        matching how ``timeline`` and ``sort_tlis`` treat them.
+        :param int decimals: Number of decimal places to keep.
+        """
+        for tli in self.doc.findall(".//tli"):
+            time = tli.get("time")
+            if time is None:
+                # Nothing to round on an untimed timeline item.
+                continue
+            tli.set("time", str(round(float(time), decimals)))
+    def find_speakers_from_tier_attrib_speaker(self) -> list[str]:
+        """Collect the ``speaker`` attribute of every tier except the one
+        displayed as [nn], dropping repeats and keeping first-seen order.
+        :return list[str]: list of speakers
+        """
+        tiers = self.doc.findall(".//tier")
+        # A dict keeps insertion order, which gives ordered deduplication.
+        unique = {
+            tier.attrib.get("speaker"): None
+            for tier in tiers
+            if tier.attrib.get("display-name") != "[nn]"
+        }
+        return list(unique)
+    def find_speakers_from_tier_display_name(self) -> list[str]:
+        """Read all the tiers, except the one named [nn], and take the first
+        whitespace-separated word of each display name as the speaker.
+        Tiers with a missing or blank display name are skipped instead of
+        raising. The result is in order of appearance.
+        :return list[str]: list of speakers
+        """
+        speakers = {}
+        for tier in self.doc.findall(".//tier"):
+            name = tier.attrib.get("display-name")
+            if name == "[nn]":
+                continue
+            words = name.split() if name else []
+            if words:
+                speakers.setdefault(words[0], None)
+        return list(speakers)
+    def remove_unused_attributes(self) -> None:
+        """Strip elements from the document, in place:
+        * ud-information named AutoSave, Dialect, Accent, Check or Scope
+        * every tier-format element
+        * every tierformat-table element
+        * ud-information named exmaralda:hidden, together with the element
+          that directly contains it
+        """
+        redundant = ("AutoSave", "Dialect", "Accent", "Check", "Scope")
+        for name in redundant:
+            logger.trace(f"Removing redundant metadata: {name}")
+            query = f'.//ud-information[@attribute-name="{name}"]'
+            for node in self.doc.findall(query):
+                node.getparent().remove(node)
+        logger.trace("Removing tier-format elements")
+        for tag in ("tier-format", "tierformat-table"):
+            for node in self.doc.findall(f".//{tag}"):
+                node.getparent().remove(node)
+        hidden = "exmaralda:hidden"
+        logger.trace(f"Removing redundant metadata: {hidden}")
+        query = f'.//ud-information[@attribute-name="{hidden}"]'
+        for node in self.doc.findall(query):
+            holder = node.getparent()
+            holder.remove(node)
+            # The containing element goes too, not just the hidden flag.
+            holder.getparent().remove(holder)
+    def save(self, file: str | Path) -> None:
+        """Save the document as pretty-indented, UTF-8 encoded XML.
+        Saving modifies the instance: the timeline is sorted and
+        ``remove_unused_attributes`` is applied before writing.
+        :param str | Path file: Path into which the result will be saved.
+            Missing parent directories are created.
+        """
+        target = Path(file)
+        self.sort_tlis()
+        self.remove_unused_attributes()
+        if not target.parent.exists():
+            logger.info("Creating parent directory")
+            target.parent.mkdir(exist_ok=True, parents=True)
+        etree.indent(self.doc)
+        target.write_text(
+            etree.tostring(
+                self.doc,
+                encoding="unicode",
+                pretty_print=True,
+                with_tail=True,
+                # The XML declaration is emitted through the doctype slot,
+                # which is written verbatim ahead of the tree.
+                doctype="""<?xml version="1.0" encoding="utf-8"?>""",
+            ),
+            # The declaration promises UTF-8, so do not fall back to the
+            # platform default encoding.
+            encoding="utf-8",
+        )
+        logger.info(f"EXB saved to {file} and formatted prettily.")
+    def sort_tlis(self) -> None:
+        """Reorder the children of ``<common-timeline>`` by ascending time.
+        Children without a ``time`` attribute sort as time 0; ties keep
+        their existing relative order.
+        """
+        timeline = self.doc.find(".//common-timeline")
+        def time_of(tli):
+            return float(tli.attrib.get("time", 0))
+        ordered = sorted(timeline[:], key=time_of)
+        timeline[:] = ordered
+    def remove_duplicated_tlis(self) -> None:
+        """Performs exact deduplication on TLI elements in place. If duplicates
+        are found, they will be removed and their references in events will be
+        changed to the non-duplicated ones.
+        Two TLIs are duplicates when their ``time`` strings are identical and
+        they are adjacent after sorting. TLIs without a ``time`` attribute are
+        never treated as duplicates.
+        """
+        self.sort_tlis()
+        kept_id = None
+        kept_time = None
+        for tli in self.doc.findall(".//tli"):
+            time = tli.get("time")
+            if time is None:
+                # Untimed items carry nothing to compare; leave them alone.
+                continue
+            if time != kept_time:
+                kept_id, kept_time = tli.get("id"), time
+                continue
+            tli_id = tli.attrib["id"]
+            # "end" is the closing reference on EXB events; "stop" is kept
+            # for documents that were already processed with that name.
+            for what in ["start", "end", "stop"]:
+                for event in self.doc.findall(f".//event[@{what}='{tli_id}']"):
+                    event.attrib[what] = kept_id
+            logger.trace(
+                f"Removing tli with id {tli_id} and time {time}, duplicate of {kept_id} at {kept_time}"
+            )
+            tli.getparent().remove(tli)
+    def copy(self):
+        """Return an independent deep copy of this instance.
+        :return EXB: Copied instance
+        """
+        from copy import deepcopy
+        return deepcopy(self)
+    def add_trailing_spaces(self):
+        """Normalise every event that has text to stripped text plus one
+        trailing space. Events without text are left as they are."""
+        for event in self.doc.findall(".//event"):
+            text = event.text
+            if not text:
+                continue
+            event.text = f"{text.strip()} "
+    @staticmethod
+    def add_trailing_spaces_to_tier(tier):
+        """Within the given tier, replace the text of every event that has
+        text with its stripped form followed by a single space."""
+        with_text = [event for event in tier.findall(".//event") if event.text]
+        for event in with_text:
+            event.text = event.text.strip() + " "
+    def add_to_timeline(
+        self, timestamp_seconds: float, remove_duplicated: bool = True
+    ) -> str:
+        """Returns the id of tli at timestamp_seconds. If there was one already,
+        it will be recycled, else a new one will be created. Time resolution: 1ms
+        :param float timestamp_seconds: Time at which to create the tli
+        :param bool remove_duplicated: Run ``remove_duplicated_tlis`` after
+            inserting a new tli.
+        :return str: the id of the tli at timestamp_seconds
+        """
+        timeline = self.timeline
+        wanted = round(timestamp_seconds, 3)
+        for tli_id, time in timeline.items():
+            if round(time, 3) == wanted:
+                return tli_id
+        # Untimed <tli> elements are absent from self.timeline but still own
+        # an id, so the collision check has to look at every <tli>.
+        taken = {tli.get("id") for tli in self.doc.findall(".//tli")}
+        number = len(timeline) + 1
+        while f"T{number}" in taken:
+            number += 1
+        proposed_id = f"T{number}"
+        tli = etree.Element("tli")
+        tli.attrib["id"] = proposed_id
+        tli.attrib["time"] = str(wanted)
+        self.doc.find(".//common-timeline").append(tli)
+        if remove_duplicated:
+            self.remove_duplicated_tlis()
+        self.sort_tlis()
+        return proposed_id
+    def test_tier_id_unique(self):
+        """Return True when no two tiers share the same ``id``."""
+        seen = set()
+        for tier_id in self.doc.xpath(".//tier/@id"):
+            if tier_id in seen:
+                return False
+            seen.add(tier_id)
+        return True
+    def test_tier_display_name_unique(self):
+        """Return True when no two tiers share the same ``display-name``."""
+        names = self.doc.xpath(".//tier/@display-name")
+        has_repeats = len(set(names)) < len(names)
+        return not has_repeats
+    def remove_duplicated_tiers(self):
+        """Removes tier, if there is another one with the same attributes
+        and the same children.
+        Tiers are compared by their serialised XML after the document has
+        been re-indented; the first occurrence is kept. Re-indenting changes
+        the whitespace of the whole document in place.
+        """
+        seen = {}
+        tiers_to_remove = []
+        etree.indent(self.doc)
+        for tier in self.doc.findall(".//tier"):
+            # Serialise attributes + children + text, but not the tail: after
+            # indenting, the last child of a parent gets a shorter tail than
+            # its siblings, which would hide a duplicate in last position.
+            tier_xml = etree.tostring(tier, encoding="unicode", with_tail=False)
+            if tier_xml in seen:
+                tiers_to_remove.append(tier)
+                logger.warning(
+                    f"Removing duplicate tier id='{tier.get('id', '?')}' "
+                    f"display-name='{tier.get('display-name', '?')}' — "
+                    f"duplicate of id='{seen[tier_xml].get('id', '?')}'"
+                )
+            else:
+                seen[tier_xml] = tier
+        for tier in tiers_to_remove:
+            tier.getparent().remove(tier)
+        logger.info(f"Removed {len(tiers_to_remove)} duplicate tier(s)")

exbee/trs_parser.py ADDED Viewed

@@ -0,0 +1,182 @@
+from pathlib import Path
+from lxml import etree  # pyright: ignore[reportAttributeAccessIssue]
+from loguru import logger
+from pydantic import BaseModel, Field, field_validator
+class Segment(BaseModel):
+    """One transcript segment: ``content`` attributed to ``speaker`` between
+    ``xmin`` and ``xmax``. ``xmax`` must be strictly greater than ``xmin``."""
+    xmin: float
+    xmax: float
+    speaker: str
+    content: str
+    @field_validator("xmax")
+    @classmethod
+    def validate_xmax(cls, v, info):
+        """Reject segments that do not end after they start."""
+        # info.data only holds fields that validated successfully. When xmin
+        # itself was invalid it is absent; skip the comparison then so the
+        # xmin error is reported instead of a stray KeyError.
+        xmin = info.data.get("xmin")
+        if xmin is not None and v <= xmin:
+            raise ValueError("xmax must be greater than xmin")
+        return v
+class TRS:
+    def __init__(self, file: Path | str):
+        """Parse a TRS file and derive its speakers and segments.
+        Sets ``path``, ``doc`` (parsed XML root), ``speakers_raw`` (speaker
+        ids used on turns), ``speaker_table`` (id to name, from the
+        ``<Speaker>`` elements), ``contents_dump`` (flat segment list),
+        ``contents`` (segments grouped per speaker) and ``speakers``
+        (names for ``speakers_raw``).
+        :param Path | str file: Path of the TRS file to read.
+        """
+        self.path = Path(file)
+        self.doc = etree.fromstring(self.path.read_bytes())
+        self.speakers_raw = self.find_speakers_from_turns()
+        self.speaker_table = {
+            s.attrib["id"]: s.attrib["name"] for s in self.doc.findall(".//Speaker")
+        }
+        self.contents_dump = self.parse_into_contents()
+        self.contents = self.postprocess_dump()
+        # A turn may name a speaker that has no <Speaker> entry; fall back to
+        # the raw id, as postprocess_dump does, instead of raising KeyError.
+        self.speakers = [self.speaker_table.get(s, s) for s in self.speakers_raw]
+    def find_speakers_from_turns(self) -> list[str]:
+        """Collect the speaker ids named in the ``speaker`` attribute of the
+        ``<Turn>`` elements. An attribute may list several ids separated by
+        whitespace; turns without the attribute are ignored.
+        :return list[str]: List of speakers, deduplicated, in order of appearance.
+        """
+        ordered = {}
+        for turn in self.doc.findall(".//Turn"):
+            if "speaker" not in turn.attrib:
+                continue
+            for speaker_id in turn.attrib["speaker"].split():
+                ordered.setdefault(speaker_id, None)
+        return list(ordered)
+    @staticmethod
+    def fragment_whos(doc):
+        """Cut the serialised descendants of ``doc`` into chunks at every
+        ``<Who>`` element.
+        Each descendant is serialised together with its tail; a new chunk
+        starts whenever a ``<Who>`` is met. Anything ahead of the first
+        ``<Who>`` forms a chunk of its own. Blank chunks are dropped.
+        :return list[str]: the XML text of each chunk
+        """
+        chunks = []
+        pending = []
+        for node in doc.iter():
+            if node == doc:  # Skip root element
+                continue
+            if node.tag == "Who" and pending:
+                chunks.append("\n".join(pending).strip())
+                pending = []
+            serialised = etree.tostring(node, encoding="unicode", with_tail=True)
+            pending.append(serialised.strip())
+        if pending:
+            chunks.append("\n".join(pending).strip())
+        return [chunk for chunk in chunks if chunk.strip()]
+    def parse_into_contents(self):
+        """Flatten every ``<Turn>`` into a list of segment dicts.
+        Each dict has the keys ``xmin``, ``xmax``, ``speaker`` and
+        ``content``. Three kinds of turn are handled:
+        * turns without any text: one segment per ``<Event>``, spanning the
+          whole turn;
+        * turns containing ``<Who>``: one segment per ``<Who>`` fragment,
+          spanning the whole turn, for the speaker selected by ``nb``;
+        * all other turns: one segment per ``<Sync>``, running to the next
+          ``<Sync>`` or to the end of the turn.
+        Events are rendered as ``[desc]``. A turn without a ``speaker``
+        attribute is attributed to ``nn``. The result is sorted by ``xmin``
+        and newlines and repeated spaces in ``content`` are collapsed.
+        :return list[dict]: the segments
+        :raises ValueError: on an element other than Sync or Event inside a
+            turn that has no ``<Who>``.
+        """
+        doc = self.doc
+        results = []
+        turns = doc.findall(".//Turn")
+        events = doc.findall(".//Event")
+        for e in events:
+            # Sanity check: events are only ever looked up inside turns below.
+            assert e.getparent().tag == "Turn"
+        for turn in turns:
+            speakers = turn.get("speaker", "").split()
+            turn_start = float(turn.get("startTime"))
+            turn_end = float(turn.get("endTime"))
+            if not "".join(turn.itertext()).strip():
+                # It's an empty turn. Check for events:
+                for e in turn.findall(".//Event"):
+                    results.append(
+                        {
+                            "xmin": turn_start,
+                            "xmax": turn_end,
+                            "speaker": speakers[0] if speakers else "nn",
+                            "content": f"[{e.get('desc')}]",
+                        }
+                    )
+                continue
+            if turn.findall(".//Who"):
+                frags = self.fragment_whos(turn)
+                frags = [i for i in frags if "<Who" in i]
+                for frag in frags:
+                    frag = etree.fromstring(f"<frag>{frag}</frag>")
+                    contents = ""
+                    for i in frag.iter():
+                        if i.tag == "Event":
+                            contents += f" [{i.get('desc')}] {i.text if i.text else ''} {i.tail if i.tail else ''}"
+                        else:
+                            # NOTE(review): this also deletes a literal "None"
+                            # occurring in the transcript text.
+                            contents += f" {i.text} {i.tail}".replace("None", "")
+                    contents = contents.strip()
+                    # nb is a 1-based index into the speakers of the turn.
+                    nb = int(frag.find(".//Who").get("nb"))
+                    speaker = speakers[nb - 1]
+                    results.append(
+                        {
+                            "xmin": turn_start,
+                            "xmax": turn_end,
+                            "speaker": speaker,
+                            "content": contents,
+                        }
+                    )
+            else:
+                start = turn_start
+                end = start
+                segments = []
+                current = None
+                # Reset per turn: an <Event> ahead of the first <Sync> must
+                # not pick up text left over from a previous turn.
+                contents = ""
+                for s in turn.iter():
+                    if s.tag == "Turn":
+                        continue
+                    if s.tag == "Sync":
+                        if current:
+                            # A new Sync closes the segment in progress.
+                            current["content"] = contents.strip()
+                            current["xmax"] = float(s.get("time"))
+                            segments.append(current)
+                        contents = f" {s.text} {s.tail}".replace("None", "")
+                        start = float(s.get("time"))
+                    elif s.tag == "Event":
+                        contents += (
+                            f" [{s.get('desc')}] {s.text} {s.tail}".strip().replace(
+                                "None", ""
+                            )
+                        )
+                    else:
+                        raise ValueError(
+                            f"Unexpected element {s.tag!r} in turn starting at {turn_start}"
+                        )
+                    current = {
+                        "xmin": start,
+                        "xmax": end,
+                        "speaker": speakers[0] if speakers else "nn",
+                        "content": contents.strip(),
+                    }
+                # The last segment runs to the end of the turn.
+                current["xmax"] = turn_end
+                if current["content"].strip():
+                    segments.append(current)
+                results.extend(segments)
+        results = sorted(results, key=lambda d: d["xmin"])
+        for i, r in enumerate(results):
+            text = r["content"].replace("\n", " ")
+            while "  " in text:
+                text = text.replace("  ", " ")
+            results[i]["content"] = text
+        return results
+    def postprocess_dump(self):
+        """Validate the parsed segments and group them by speaker name.
+        Every segment in ``self.contents_dump`` is checked by building a
+        ``Segment`` from it. Segments are then grouped under the name that
+        ``self.speaker_table`` gives for their speaker id, or under the raw
+        id when the table has no entry, and sorted by ``xmin``. Segments of
+        the undeclared ``nn`` placeholder speaker are stored in ``self.nn``
+        and, as before, are not part of the returned mapping.
+        :return dict[str, list[dict]]: segments per speaker
+        """
+        results = self.contents_dump
+        for segment in results:
+            # Built only for its validation; raises on a malformed segment.
+            Segment(**segment)
+        by_id = {}
+        for segment in results:
+            by_id.setdefault(segment["speaker"], []).append(segment)
+        self.nn = by_id.get("nn", [])
+        # Build a fresh mapping rather than renaming keys in place: renaming
+        # deleted the entry again whenever the name equalled the raw id, which
+        # silently dropped every speaker missing from the speaker table.
+        by_name = {}
+        for speaker_id, segments in by_id.items():
+            if speaker_id == "nn" and "nn" not in self.speaker_table:
+                continue
+            name = self.speaker_table.get(speaker_id, speaker_id)
+            # Two ids sharing one name are merged instead of overwritten.
+            by_name.setdefault(name, []).extend(segments)
+        for segments in by_name.values():
+            segments.sort(key=lambda d: float(d["xmin"]))
+        return by_name

exbee-2026.6.4.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,24 @@
+Metadata-Version: 2.3
+Name: exbee
+Version: 2026.6.4
+Summary: A small utility for wrangling EXB data
+Author: Peter Rupnik
+Author-email: Peter Rupnik <peter.rupnik@ijs.si>
+Requires-Dist: loguru>=0.5.1
+Requires-Dist: lxml>=4.0.0
+Requires-Dist: pytest>=5.0.0
+Requires-Dist: pydantic>=1.0
+Requires-Dist: black ; extra == 'dev'
+Requires-Dist: bumpver ; extra == 'dev'
+Requires-Dist: isort ; extra == 'dev'
+Requires-Dist: pip-tools ; extra == 'dev'
+Requires-Dist: pytest ; extra == 'dev'
+Requires-Dist: pytest-cov ; extra == 'dev'
+Requires-Dist: twine ; extra == 'dev'
+Requires-Dist: ruff ; extra == 'dev'
+Requires-Dist: bandit ; extra == 'dev'
+Requires-Dist: pre-commit ; extra == 'dev'
+Requires-Python: >=3.10
+Provides-Extra: dev
+Description-Content-Type: text/markdown

exbee-2026.6.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+exbee/__init__.py,sha256=yJDdsYj_vLAK3-VgGNkwsX6IFj6sk4wvv6C2XnBRu3k,236
+exbee/exb_parser.py,sha256=txW4mwtGzZrE-bZyKQq4ov4POkM71Nhcm3NcoMEbzLM,9638
+exbee/trs_parser.py,sha256=BGs-cUmaR7wtizCxTkixq61O8Bnh-VmTJDwaWj0k-vo,6769
+exbee-2026.6.4.dist-info/WHEEL,sha256=iHtWm8nRfs0VRdCYVXocAWFW8ppjHL-uTJkAdZJKOBM,80
+exbee-2026.6.4.dist-info/entry_points.txt,sha256=GQXkeC6qyS8MO3f9fxHBmN19v1S7DHVlsDpAnq0XFb0,38
+exbee-2026.6.4.dist-info/METADATA,sha256=11a9p3kWUhqZGmB4YxhTRnGtNxl6W1JxDw-eQ_TKC_E,766
+exbee-2026.6.4.dist-info/RECORD,,

exbee-2026.6.4.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: uv 0.9.30
+Root-Is-Purelib: true
+Tag: py3-none-any

exbee-2026.6.4.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+exbee = exbee:main