exbee 2026.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exbee/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ from exbee.exb_parser import EXB
2
+ from exbee.trs_parser import TRS
3
+
4
+ __version__ = "2026.6.4"
5
+
6
+
7
def main() -> None:
    """Console entry point: print every parsed segment of a ``.trs`` file.

    The transcript path is read from the command line
    (``exbee path/to/file.trs``) rather than from a path that only exists
    on the original developer's machine. When the argument is missing,
    ``argparse`` prints a usage message and exits with a non-zero status.
    """
    # Imported locally so the module's top-level import block is untouched.
    import argparse

    parser = argparse.ArgumentParser(
        prog="exbee",
        description="Print the parsed segments of a TRS transcript.",
    )
    parser.add_argument("trs_file", help="path to the .trs file to dump")
    args = parser.parse_args()

    trs = TRS(args.trs_file)
    for segment in trs.contents_dump:
        print(segment)
exbee/exb_parser.py ADDED
@@ -0,0 +1,261 @@
1
+ from pathlib import Path
2
+ from lxml import etree # pyright: ignore[reportAttributeAccessIssue]
3
+ from loguru import logger
4
+
5
+
6
+ class EXB:
7
+ def __init__(self, file: Path | str):
8
+ self.path = Path(file)
9
+ self.doc = etree.fromstring(Path(file).read_bytes())
10
+ # self.timeline = self.get_timeline()
11
+ # self.speakers = self.find_speakers_from_tier_attrib_speaker()
12
+ self.wavfile_raw = Path(self.doc.find(".//referenced-file").attrib["url"])
13
+ self.wavfile_abs = (
14
+ self.path.absolute().resolve().parent / self.wavfile_raw
15
+ ).absolute()
16
+
17
+ # Check if trouble:
18
+ if not self.test_tier_id_unique():
19
+ logger.critical(f"Tiers have non-unique ids! Fix it!")
20
+ if not self.test_tier_display_name_unique():
21
+ logger.critical(f"Tiers have non-unique display names! Fix it!")
22
+
23
+ def get_tier_names(self):
24
+ tiers = self.doc.findall(".//tier")
25
+ return [t.attrib.get("display-name", "<NO DISPLAY NAME!>") for t in tiers]
26
+
27
+ @property
28
+ def tier_names(self):
29
+ """Get the names of all tiers"""
30
+ return [
31
+ t.attrib.get("display-name", "<NO DISPLAY NAME!>")
32
+ for t in self.doc.findall(".//tier")
33
+ ]
34
+
35
+ @property
36
+ def timeline(self):
37
+ """Find all <tli> elements and parse them as a dict with id:float pairs"""
38
+ return {
39
+ i.attrib["id"]: float(i.attrib.get("time"))
40
+ for i in self.doc.findall(".//tli")
41
+ if "time" in i.attrib.keys()
42
+ }
43
+
44
+ @property
45
+ def speakers(self):
46
+ """Read all the tiers, except the one named [nn], and extract speakers from the attributes"""
47
+ return list(
48
+ dict.fromkeys(
49
+ [
50
+ i.attrib.get("speaker")
51
+ for i in self.doc.findall(".//tier")
52
+ if i.attrib.get("display-name") != "[nn]"
53
+ ]
54
+ )
55
+ )
56
+
57
+
58
+ def round_timeline(self, decimals=3) -> None:
59
+ """Round all the timestamps to desired precision"""
60
+ for tli in self.doc.findall(".//tli"):
61
+ tli.set("time", str(round(float(tli.get("time")), decimals)))
62
+
63
+ def find_speakers_from_tier_attrib_speaker(self) -> list[str]:
64
+ """Read all the tiers, except the one named [nn], and extract
65
+ speakers from the attributes. The result is in order of appearance.
66
+
67
+ :return list[str]: list of speakers
68
+ """
69
+ speakers = [
70
+ i.attrib.get("speaker")
71
+ for i in self.doc.findall(".//tier")
72
+ if i.attrib.get("display-name") != "[nn]"
73
+ ]
74
+ return list(dict.fromkeys(speakers))
75
+
76
+ def find_speakers_from_tier_display_name(self) -> list[str]:
77
+ """Read all the tiers, except the one named [nn], and extract
78
+ speakers from the attributes. The result is in order of appearance.
79
+
80
+ :return list[str]: list of speakers
81
+ """
82
+ speakers = [
83
+ i.attrib.get("display-name").split()[0]
84
+ for i in self.doc.findall(".//tier")
85
+ if i.attrib.get("display-name") != "[nn]"
86
+ ]
87
+ return list(dict.fromkeys(speakers))
88
+
89
+ def remove_unused_attributes(self) -> None:
90
+ """Removes redundant elements in EXB:
91
+ * AutoSave ud-information
92
+ * Dialect ud-information
93
+ * Accent ud-information
94
+ * Check ud-information
95
+ * Scope ud-information
96
+ * Tier format
97
+ * Tier format table
98
+ * hidden tier tags
99
+
100
+ """
101
+ for attribute in [
102
+ "AutoSave",
103
+ "Dialect",
104
+ "Accent",
105
+ "Check",
106
+ "Scope",
107
+ ]:
108
+ logger.trace(f"Removing redundant metadata: {attribute}")
109
+ for i in self.doc.findall(
110
+ f'.//ud-information[@attribute-name="{attribute}"]'
111
+ ):
112
+ i.getparent().remove(i)
113
+ logger.trace("Removing tier-format elements")
114
+ for i in self.doc.findall(".//tier-format"):
115
+ i.getparent().remove(i)
116
+ for i in self.doc.findall(".//tierformat-table"):
117
+ i.getparent().remove(i)
118
+ for attribute in [
119
+ "exmaralda:hidden",
120
+ ]:
121
+ logger.trace(f"Removing redundant metadata: {attribute}")
122
+ for i in self.doc.findall(
123
+ f'.//ud-information[@attribute-name="{attribute}"]'
124
+ ):
125
+ parent = i.getparent()
126
+ parent.remove(i)
127
+ parent.getparent().remove(parent)
128
+
129
+ def save(self, file: str | Path) -> None:
130
+ """Saves the doc with Unicode formatting with pretty
131
+ indenting.
132
+
133
+ :param str | Path file: Path into which the result will be saved.
134
+ """
135
+ # self.remove_duplicated_tlis()
136
+ self.sort_tlis()
137
+ self.remove_unused_attributes()
138
+ if not Path(file).parent.exists():
139
+ logger.info("Creating parent directory")
140
+ Path(file).parent.mkdir(exist_ok=True, parents=True)
141
+ etree.indent(self.doc)
142
+ Path(file).write_text(
143
+ etree.tostring(
144
+ self.doc,
145
+ encoding="unicode",
146
+ pretty_print=True,
147
+ with_tail=True,
148
+ doctype="""<?xml version="1.0" encoding="utf-8"?>""",
149
+ )
150
+ )
151
+ logger.info(f"EXB saved to {file} and formatted prettily.")
152
+
153
+ def sort_tlis(self) -> None:
154
+ tl = self.doc.find(".//common-timeline")
155
+ tl[:] = sorted(tl[:], key=lambda tli: float(tli.attrib.get("time", 0)))
156
+
157
+ def remove_duplicated_tlis(self) -> None:
158
+ """Performs exact deduplication on TLI elements in place. If duplicates
159
+ are found, they will be removed and their references in events will be
160
+ changed to the non-duplicated ones."""
161
+
162
+ self.sort_tlis()
163
+ previous = dict(id=None, time=None)
164
+ for tli in self.doc.findall(".//tli"):
165
+ if tli.attrib["time"] == previous["time"]:
166
+ id = tli.attrib["id"]
167
+ for what in ["start", "stop"]:
168
+ for event in self.doc.findall(f".//event[@{what}='{id}']"):
169
+ event.attrib[what] = previous["id"]
170
+ logger.trace(
171
+ f"Removing tli with id {tli.attrib['id']} and time {tli.attrib['time']}, duplicate of {previous['id']} at {previous['time']}"
172
+ )
173
+ tli.getparent().remove(tli)
174
+ else:
175
+ previous = tli.attrib
176
+
177
+ def copy(self):
178
+ """Returns a deep copy of the EXB instance
179
+
180
+ :return EXB: Copied instance
181
+ """
182
+ import copy
183
+
184
+ return copy.deepcopy(self)
185
+
186
+ def add_trailing_spaces(self):
187
+ """Strip all events with text and then append a trailing space."""
188
+ for event in self.doc.findall(".//event"):
189
+ if event.text:
190
+ event.text = event.text.strip() + " "
191
+
192
+ @staticmethod
193
+ def add_trailing_spaces_to_tier(tier):
194
+ """Within the tier, strip all events with text and then append a trailing space."""
195
+ for event in tier.findall(".//event"):
196
+ if event.text:
197
+ event.text = event.text.strip() + " "
198
+
199
+ def add_to_timeline(
200
+ self, timestamp_seconds: float, remove_duplicated: bool = True
201
+ ) -> str:
202
+ """Returns the id of tli at timestamp_seconds. If there was one already,
203
+ it will be recycled, else a new one will be created. Time resolution: 1ms
204
+
205
+ :param float timestamp_seconds: Time at which to create the tli
206
+ :return str: the id of the tli at timestamp_seconds
207
+ """
208
+ timeline = self.timeline
209
+
210
+ if round(timestamp_seconds, 3) in [round(i, 3) for i in timeline.values()]:
211
+ for id, time in timeline.items():
212
+ if round(timestamp_seconds, 3) == round(time, 3):
213
+ return id
214
+ L = len(timeline) + 1
215
+ while True:
216
+ proposed_id = f"T{L}"
217
+ if proposed_id in self.timeline.keys():
218
+ L += 1
219
+ else:
220
+ break
221
+ tli = etree.Element("tli")
222
+ tli.attrib["id"] = proposed_id
223
+ tli.attrib["time"] = str(round(timestamp_seconds, 3))
224
+ self.doc.find(".//common-timeline").append(tli)
225
+ if remove_duplicated:
226
+ self.remove_duplicated_tlis()
227
+ self.sort_tlis()
228
+ return proposed_id
229
+
230
+ def test_tier_id_unique(self):
231
+ ids = self.doc.xpath(".//tier/@id")
232
+ return len(ids) == len(set(ids))
233
+
234
+ def test_tier_display_name_unique(self):
235
+ dispnames = self.doc.xpath(".//tier/@display-name")
236
+ return len(dispnames) == len(set(dispnames))
237
+
238
+ def remove_duplicated_tiers(self):
239
+ """Removes tier, if there is another one with the same attributes
240
+ and the same children."""
241
+ seen = {}
242
+ tiers_to_remove = []
243
+ etree.indent(self.doc)
244
+ for tier in self.doc.findall(".//tier"):
245
+ # Get the full XML string of this tier (attributes + children + text)
246
+ tier_xml = etree.tostring(tier, encoding="unicode")
247
+
248
+ if tier_xml in seen:
249
+ tiers_to_remove.append(tier)
250
+ logger.warning(
251
+ f"Removing duplicate tier id='{tier.get('id', '?')}' "
252
+ f"display-name='{tier.get('display-name', '?')}' — "
253
+ f"duplicate of id='{seen[tier_xml].get('id', '?')}'"
254
+ )
255
+ else:
256
+ seen[tier_xml] = tier
257
+
258
+ for tier in tiers_to_remove:
259
+ tier.getparent().remove(tier)
260
+
261
+ logger.info(f"Removed {len(tiers_to_remove)} duplicate tier(s)")
exbee/trs_parser.py ADDED
@@ -0,0 +1,182 @@
1
+ from pathlib import Path
2
+ from lxml import etree # pyright: ignore[reportAttributeAccessIssue]
3
+ from loguru import logger
4
+ from pydantic import BaseModel, Field, field_validator
5
+
6
+
7
class Segment(BaseModel):
    """One transcribed stretch of speech, used to validate parsed segments."""

    # Start of the segment (presumably seconds; confirm against the source data).
    xmin: float
    # End of the segment; must be strictly greater than ``xmin``.
    xmax: float
    # Speaker the segment is attributed to.
    speaker: str
    # Transcribed text of the segment.
    content: str

    @field_validator("xmax")
    @classmethod
    def validate_xmax(cls, v, info):
        """Reject segments that do not end after they start.

        ``info.data`` only holds fields that already validated. When ``xmin``
        itself was invalid it is absent, and the comparison is skipped so the
        caller gets the ``xmin`` validation error instead of a ``KeyError``.

        :raises ValueError: if ``xmax`` is not greater than ``xmin``.
        """
        xmin = info.data.get("xmin")
        if xmin is not None and v <= xmin:
            raise ValueError("xmax must be greater than xmin")
        return v
19
+
20
+
21
+ class TRS:
22
+ def __init__(self, file: Path | str):
23
+ self.path = Path(file)
24
+ self.doc = etree.fromstring(Path(file).read_bytes())
25
+ self.speakers_raw = self.find_speakers_from_turns()
26
+ self.speaker_table = {
27
+ s.attrib["id"]: s.attrib["name"] for s in self.doc.findall(".//Speaker")
28
+ }
29
+ self.contents_dump = self.parse_into_contents()
30
+ self.contents = self.postprocess_dump()
31
+ self.speakers = [self.speaker_table[s] for s in self.speakers_raw]
32
+
33
+ def find_speakers_from_turns(self) -> list[str]:
34
+ """Extracts speakers from tier speaker attribute
35
+
36
+ :return list[str]: List of speakers, deduplicated, in order of appearance.
37
+ """
38
+ turns = self.doc.findall(".//Turn")
39
+ turns = [t for t in turns if "speaker" in t.attrib]
40
+ speakers = [t.attrib["speaker"] for t in turns]
41
+ speakers = [i for s in speakers for i in s.split()]
42
+ # speakers = [s for s in speakers if self.doc.find(f".//Turn[@speaker='{s}']")]
43
+ speakers = list(dict.fromkeys(speakers))
44
+ return speakers
45
+
46
+ @staticmethod
47
+ def fragment_whos(doc):
48
+ who_elements = doc.findall(".//Who")
49
+ results = []
50
+
51
+ parts = []
52
+ current_part = []
53
+
54
+ for node in doc.iter():
55
+ if node == doc: # Skip root element
56
+ continue
57
+
58
+ if node.tag == "Who":
59
+ if current_part:
60
+ parts.append("\n".join(current_part).strip())
61
+ current_part = []
62
+
63
+ current_part.append(
64
+ etree.tostring(node, encoding="unicode", with_tail=True).strip()
65
+ )
66
+
67
+ if current_part:
68
+ parts.append("\n".join(current_part).strip())
69
+
70
+ parts = [p for p in parts if p.strip()]
71
+ return parts
72
+
73
+ def parse_into_contents(self):
74
+ doc = self.doc
75
+ results = []
76
+ turns = doc.findall(".//Turn")
77
+ events = doc.findall(".//Event")
78
+ for e in events:
79
+ assert e.getparent().tag == "Turn"
80
+ for turn in turns:
81
+ speakers = turn.get("speaker", "").split()
82
+ turn_start = float(turn.get("startTime"))
83
+ turn_end = float(turn.get("endTime"))
84
+ if not "".join(turn.itertext()).strip():
85
+ # It's an empty turn. Check for events:
86
+ for e in turn.findall(".//Event"):
87
+ results.append(
88
+ {
89
+ "xmin": turn_start,
90
+ "xmax": turn_end,
91
+ "speaker": speakers[0] if speakers else "nn",
92
+ "content": f"[{e.get('desc')}]",
93
+ }
94
+ )
95
+ continue
96
+ if whos := list(turn.findall(".//Who")):
97
+ frags = self.fragment_whos(turn)
98
+ frags = [i for i in frags if "<Who" in i]
99
+
100
+ for frag in frags:
101
+ frag = etree.fromstring(f"<frag>{frag}</frag>")
102
+ contents = ""
103
+ for i in frag.iter():
104
+ if i.tag == "Event":
105
+ contents += f" [{i.get('desc')}] {i.text if i.text else ''} {i.tail if i.tail else ''}"
106
+ else:
107
+ contents += f" {i.text} {i.tail}".replace("None", "")
108
+ contents = contents.strip()
109
+ 2 + 2
110
+ nb = int(frag.find(".//Who").get("nb"))
111
+ speaker = speakers[nb - 1]
112
+ results.append(
113
+ {
114
+ "xmin": turn_start,
115
+ "xmax": turn_end,
116
+ "speaker": speaker,
117
+ "content": contents,
118
+ }
119
+ )
120
+ else:
121
+ start = turn_start
122
+ end = start
123
+ segments = []
124
+ current = None
125
+ for s in turn.iter():
126
+ if s.tag == "Turn":
127
+ continue
128
+ if s.tag == "Sync":
129
+ if current:
130
+ current["content"] = contents.strip()
131
+ current["xmax"] = float(s.get("time"))
132
+ segments.append(current)
133
+ contents = f" {s.text} {s.tail}".replace("None", "")
134
+ start = float(s.get("time"))
135
+ elif s.tag == "Event":
136
+ contents += (
137
+ f" [{s.get('desc')}] {s.text} {s.tail}".strip().replace(
138
+ "None", ""
139
+ )
140
+ )
141
+ else:
142
+ 1 / 0
143
+ current = {
144
+ "xmin": start,
145
+ "xmax": end,
146
+ "speaker": speakers[0],
147
+ "content": contents.strip(),
148
+ }
149
+ current["xmax"] = turn_end
150
+ if current["content"].strip():
151
+ segments.append(current)
152
+ else:
153
+ 2 + 2
154
+ results.extend(segments)
155
+ results = sorted(results, key=lambda d: d["xmin"])
156
+ for i, r in enumerate(results):
157
+ text = r["content"].replace("\n", " ")
158
+ while " " in text:
159
+ text = text.replace(" ", " ")
160
+ results[i]["content"] = text
161
+ return results
162
+
163
+ def postprocess_dump(self):
164
+ results = self.contents_dump
165
+ for i in results:
166
+ Segment(**i)
167
+ speakers = set(d["speaker"] for d in results)
168
+ new_results = dict()
169
+ for i in results:
170
+ new_results[i["speaker"]] = new_results.get(i["speaker"], []) + [i]
171
+ if "nn" in new_results:
172
+ self.nn = new_results["nn"]
173
+ else:
174
+ self.nn = []
175
+ # return new_results
176
+ old_speakers = list(new_results.keys())
177
+ for o in old_speakers:
178
+ new_results[self.speaker_table.get(o, o)] = sorted(
179
+ new_results[o], key=lambda d: float(d["xmin"])
180
+ )
181
+ del new_results[o]
182
+ return new_results
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.3
2
+ Name: exbee
3
+ Version: 2026.6.4
4
+ Summary: A small utility for wrangling EXB data
5
+ Author: Peter Rupnik
6
+ Author-email: Peter Rupnik <peter.rupnik@ijs.si>
7
+ Requires-Dist: loguru>=0.5.1
8
+ Requires-Dist: lxml>=4.0.0
9
+ Requires-Dist: pytest>=5.0.0
10
+ Requires-Dist: pydantic>=1.0
11
+ Requires-Dist: black ; extra == 'dev'
12
+ Requires-Dist: bumpver ; extra == 'dev'
13
+ Requires-Dist: isort ; extra == 'dev'
14
+ Requires-Dist: pip-tools ; extra == 'dev'
15
+ Requires-Dist: pytest ; extra == 'dev'
16
+ Requires-Dist: pytest-cov ; extra == 'dev'
17
+ Requires-Dist: twine ; extra == 'dev'
18
+ Requires-Dist: ruff ; extra == 'dev'
19
+ Requires-Dist: bandit ; extra == 'dev'
20
+ Requires-Dist: pre-commit ; extra == 'dev'
21
+ Requires-Python: >=3.10
22
+ Provides-Extra: dev
23
+ Description-Content-Type: text/markdown
24
+
@@ -0,0 +1,7 @@
1
+ exbee/__init__.py,sha256=yJDdsYj_vLAK3-VgGNkwsX6IFj6sk4wvv6C2XnBRu3k,236
2
+ exbee/exb_parser.py,sha256=txW4mwtGzZrE-bZyKQq4ov4POkM71Nhcm3NcoMEbzLM,9638
3
+ exbee/trs_parser.py,sha256=BGs-cUmaR7wtizCxTkixq61O8Bnh-VmTJDwaWj0k-vo,6769
4
+ exbee-2026.6.4.dist-info/WHEEL,sha256=iHtWm8nRfs0VRdCYVXocAWFW8ppjHL-uTJkAdZJKOBM,80
5
+ exbee-2026.6.4.dist-info/entry_points.txt,sha256=GQXkeC6qyS8MO3f9fxHBmN19v1S7DHVlsDpAnq0XFb0,38
6
+ exbee-2026.6.4.dist-info/METADATA,sha256=11a9p3kWUhqZGmB4YxhTRnGtNxl6W1JxDw-eQ_TKC_E,766
7
+ exbee-2026.6.4.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.30
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ exbee = exbee:main
3
+