subtitle-engine 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {subtitle_engine-0.1.2/src/subtitle_engine.egg-info → subtitle_engine-0.1.3}/PKG-INFO +9 -2
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/README.md +8 -1
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/pyproject.toml +1 -1
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/__init__.py +1 -1
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/cli.py +15 -0
- subtitle_engine-0.1.3/src/subtitle_engine/segmenter.py +165 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3/src/subtitle_engine.egg-info}/PKG-INFO +9 -2
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/SOURCES.txt +2 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_cli.py +27 -2
- subtitle_engine-0.1.3/tests/test_segmenter.py +84 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/LICENSE +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/setup.cfg +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/captioner.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/srt_writer.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/transcriber.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/updater.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/utils.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/dependency_links.txt +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/entry_points.txt +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/requires.txt +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/top_level.txt +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_captioner.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_srt_writer.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_transcriber.py +0 -0
- {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_updater.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: subtitle-engine
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Generate SRT subtitles from audio/video files using WhisperX
|
|
5
5
|
Author: Leevi Puntanen
|
|
6
6
|
License-Expression: MIT
|
|
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
|
|
|
65
65
|
subeng video.mp4 --diarize --hf-token $HF_TOKEN
|
|
66
66
|
|
|
67
67
|
# Generate a caption from the transcript using Ollama
|
|
68
|
-
subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
68
|
+
subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
|
|
69
|
+
|
|
70
|
+
# Short-form subtitles (2-5 words per line, default)
|
|
71
|
+
subeng video.mp4 --preset shortform
|
|
72
|
+
|
|
73
|
+
# Long-form subtitles (10-14 words per line)
|
|
74
|
+
subeng video.mp4 --preset longform
|
|
69
75
|
```
|
|
70
76
|
|
|
71
77
|
## Options
|
|
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
|
83
89
|
| `--caption` | Generate a caption from the transcript via Ollama |
|
|
84
90
|
| `--ollama-model` | Ollama model name (required with `--caption`) |
|
|
85
91
|
| `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
|
|
92
|
+
| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
|
|
86
93
|
|
|
87
94
|
## Development
|
|
88
95
|
|
|
@@ -39,7 +39,13 @@ subeng video.mp4 --device cpu
|
|
|
39
39
|
subeng video.mp4 --diarize --hf-token $HF_TOKEN
|
|
40
40
|
|
|
41
41
|
# Generate a caption from the transcript using Ollama
|
|
42
|
-
subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
42
|
+
subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
|
|
43
|
+
|
|
44
|
+
# Short-form subtitles (2-5 words per line, default)
|
|
45
|
+
subeng video.mp4 --preset shortform
|
|
46
|
+
|
|
47
|
+
# Long-form subtitles (10-14 words per line)
|
|
48
|
+
subeng video.mp4 --preset longform
|
|
43
49
|
```
|
|
44
50
|
|
|
45
51
|
## Options
|
|
@@ -57,6 +63,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
|
57
63
|
| `--caption` | Generate a caption from the transcript via Ollama |
|
|
58
64
|
| `--ollama-model` | Ollama model name (required with `--caption`) |
|
|
59
65
|
| `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
|
|
66
|
+
| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
|
|
60
67
|
|
|
61
68
|
## Development
|
|
62
69
|
|
|
@@ -9,6 +9,7 @@ from rich.console import Console
|
|
|
9
9
|
|
|
10
10
|
from subtitle_engine import __version__
|
|
11
11
|
from subtitle_engine.captioner import generate_caption
|
|
12
|
+
from subtitle_engine.segmenter import VALID_PRESETS, split_segments
|
|
12
13
|
from subtitle_engine.srt_writer import write_srt
|
|
13
14
|
from subtitle_engine.transcriber import transcribe
|
|
14
15
|
from subtitle_engine.updater import UpdateCheckError, check_for_update, update_package
|
|
@@ -162,6 +163,14 @@ def main(
|
|
|
162
163
|
envvar="OLLAMA_HOST",
|
|
163
164
|
),
|
|
164
165
|
] = "http://localhost:11434",
|
|
166
|
+
preset: Annotated[
|
|
167
|
+
str,
|
|
168
|
+
typer.Option(
|
|
169
|
+
"--preset",
|
|
170
|
+
"-p",
|
|
171
|
+
help="Subtitle style: shortform (2-5 words) or longform (10-14 words).",
|
|
172
|
+
),
|
|
173
|
+
] = "shortform",
|
|
165
174
|
quiet: Annotated[
|
|
166
175
|
bool,
|
|
167
176
|
typer.Option(
|
|
@@ -190,6 +199,10 @@ def main(
|
|
|
190
199
|
) -> None:
|
|
191
200
|
"""Generate SRT subtitles from a media file."""
|
|
192
201
|
try:
|
|
202
|
+
if preset not in VALID_PRESETS:
|
|
203
|
+
valid = ", ".join(sorted(VALID_PRESETS))
|
|
204
|
+
raise ValueError(f"Unknown preset '{preset}'. Choose from: {valid}")
|
|
205
|
+
|
|
193
206
|
validate_media_file(input_file)
|
|
194
207
|
output_path = resolve_output_path(input_file, output)
|
|
195
208
|
|
|
@@ -208,6 +221,7 @@ def main(
|
|
|
208
221
|
if not quiet:
|
|
209
222
|
console.print(f"[bold]Transcribing:[/bold] {input_file}")
|
|
210
223
|
console.print(f"[bold]Model:[/bold] {model}")
|
|
224
|
+
console.print(f"[bold]Preset:[/bold] {preset}")
|
|
211
225
|
if language:
|
|
212
226
|
console.print(f"[bold]Language:[/bold] {language}")
|
|
213
227
|
if device:
|
|
@@ -225,6 +239,7 @@ def main(
|
|
|
225
239
|
verbose=verbose,
|
|
226
240
|
)
|
|
227
241
|
|
|
242
|
+
segments = split_segments(segments, preset=preset)
|
|
228
243
|
write_srt(segments, output_path)
|
|
229
244
|
if not quiet:
|
|
230
245
|
console.print(f"[green]Wrote subtitles to:[/green] {output_path}")
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Split WhisperX segments into shorter or longer subtitle chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Names of the supported subtitle presets (also the values accepted by --preset).
PRESET_SHORTFORM = "shortform"
PRESET_LONGFORM = "longform"

# Word-count targets per subtitle block, keyed by preset: (min_words, max_words).
PRESET_TARGETS = {
    PRESET_SHORTFORM: (2, 5),
    PRESET_LONGFORM: (10, 14),
}

# Derived from PRESET_TARGETS so a preset can never be "valid" without also
# having word-count targets (and vice versa).
VALID_PRESETS = set(PRESET_TARGETS)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _sanitize_text(text: str) -> str:
|
|
21
|
+
"""Return a cleaned version of the text for display."""
|
|
22
|
+
return " ".join(text.split())
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _words_from_segment(segment: dict) -> list[dict]:
|
|
26
|
+
"""Extract a clean list of word dicts from a WhisperX segment.
|
|
27
|
+
|
|
28
|
+
Each word dict should have ``word`` and optionally ``start``/``end``.
|
|
29
|
+
"""
|
|
30
|
+
raw_words = segment.get("words", [])
|
|
31
|
+
words = []
|
|
32
|
+
for word_entry in raw_words:
|
|
33
|
+
if isinstance(word_entry, dict):
|
|
34
|
+
word_text = word_entry.get("word", "").strip()
|
|
35
|
+
else:
|
|
36
|
+
word_text = str(word_entry).strip()
|
|
37
|
+
if word_text:
|
|
38
|
+
words.append({"word": word_text, **word_entry} if isinstance(word_entry, dict) else {"word": word_text})
|
|
39
|
+
return words
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _split_text_evenly(text: str, chunk_count: int) -> list[str]:
|
|
43
|
+
"""Split text into ``chunk_count`` roughly equal word groups."""
|
|
44
|
+
tokens = text.split()
|
|
45
|
+
if chunk_count <= 1 or len(tokens) <= chunk_count:
|
|
46
|
+
return [text]
|
|
47
|
+
|
|
48
|
+
base_size, remainder = divmod(len(tokens), chunk_count)
|
|
49
|
+
chunks = []
|
|
50
|
+
index = 0
|
|
51
|
+
for i in range(chunk_count):
|
|
52
|
+
size = base_size + (1 if i < remainder else 0)
|
|
53
|
+
chunks.append(" ".join(tokens[index : index + size]))
|
|
54
|
+
index += size
|
|
55
|
+
return chunks
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _dominant_speaker(words: list[dict]) -> str | None:
|
|
59
|
+
"""Return the most common speaker label among the given words, if any."""
|
|
60
|
+
speakers = [w.get("speaker") for w in words if w.get("speaker")]
|
|
61
|
+
if not speakers:
|
|
62
|
+
return None
|
|
63
|
+
return Counter(speakers).most_common(1)[0][0]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _prefix_speaker(text: str, speaker: str | None) -> str:
|
|
67
|
+
"""Prefix a speaker label to text when one is known."""
|
|
68
|
+
if not speaker:
|
|
69
|
+
return text
|
|
70
|
+
return f"[{speaker}] {text}"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _chunk_words(words: list[dict], min_words: int, max_words: int) -> list[list[dict]]:
    """Group ``words`` into consecutive chunks of at most ``max_words`` where possible."""
    step = max(1, max_words)
    chunks = [words[i:i + step] for i in range(0, len(words), step)]

    if len(chunks) > 1 and len(chunks[-1]) < min_words:
        # The tail is too short to stand alone: pool it with the previous chunk.
        pooled = chunks[-2] + chunks[-1]
        half = (len(pooled) + 1) // 2
        if len(pooled) - half >= min_words:
            # Re-split the pool evenly so neither block exceeds ``max_words``.
            chunks[-2:] = [pooled[:half], pooled[half:]]
        else:
            # No split can honour ``min_words``; accept one over-long block
            # rather than emitting a fragment.
            chunks[-2:] = [pooled]
    return chunks


def _split_segment(
    segment: dict,
    min_words: int,
    max_words: int,
) -> list[dict]:
    """Split a single WhisperX segment into subtitle-sized chunks.

    Words come from ``segment["words"]``; when that list is missing or empty,
    the whitespace-separated tokens of ``segment["text"]`` are used instead,
    so text-only segments are split as well.

    Words are grouped into runs of up to ``max_words``. A final run shorter
    than ``min_words`` is pooled with the previous run and the pool is split
    in half when both halves reach ``min_words``; otherwise the pool is kept
    as one block, which may then exceed ``max_words``.

    Timing per chunk: if any word in the chunk has both ``start`` and ``end``,
    the chunk spans from the first such word's ``start`` to the last such
    word's ``end``. Otherwise the chunk receives a share of the segment's
    ``start``..``end`` interval proportional to its position and word count.

    Returns a list of ``{"start", "end", "text"}`` dicts. ``text`` is prefixed
    with the chunk's dominant word-level speaker label when one exists. A
    segment with no usable text yields an empty list.
    """
    # ``None`` timestamps are treated as missing instead of crashing ``float``.
    segment_start = float(segment.get("start") or 0.0)
    raw_end = segment.get("end")
    segment_end = segment_start if raw_end is None else float(raw_end)

    words = _words_from_segment(segment)
    if not words:
        # No word-level data: fall back to plain tokens of the segment text.
        cleaned = _sanitize_text(str(segment.get("text") or ""))
        words = [{"word": token} for token in cleaned.split()]
    if not words:
        return []

    duration = segment_end - segment_start
    total_words = len(words)

    result: list[dict] = []
    words_seen = 0
    for chunk in _chunk_words(words, min_words, max_words):
        # Track the chunk's word offsets by running count; looking the chunk
        # up by value would confuse chunks that happen to compare equal.
        first_word, words_seen = words_seen, words_seen + len(chunk)

        text = _sanitize_text(" ".join(w["word"].strip() for w in chunk))
        if not text:
            continue

        timed_words = [
            w for w in chunk
            if w.get("start") is not None and w.get("end") is not None
        ]
        if timed_words:
            start = float(timed_words[0]["start"])
            end = float(timed_words[-1]["end"])
        else:
            # Fallback: divide the segment duration proportionally to words.
            start = segment_start + duration * (first_word / total_words)
            end = segment_start + duration * (words_seen / total_words)

        text = _prefix_speaker(text, _dominant_speaker(chunk))
        result.append({"start": start, "end": end, "text": text})

    return result
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def split_segments(
    segments: Iterable[dict],
    preset: str = PRESET_SHORTFORM,
) -> list[dict]:
    """Split WhisperX segments into subtitle blocks sized for ``preset``.

    Each input segment is split independently; words are never moved across
    segment boundaries, so a segment shorter than the preset's minimum is
    emitted as a single, shorter block.

    Parameters
    ----------
    segments:
        WhisperX segments with ``start``, ``end``, ``text`` and optionally
        per-word timings.
    preset:
        ``shortform`` or ``longform``.

    Returns
    -------
    A flat list of segment dicts suitable for writing to SRT, in input order.

    Raises
    ------
    ValueError
        If ``preset`` is not one of ``VALID_PRESETS``.
    """
    # Validate before touching ``segments`` so a bad preset fails fast even
    # when the iterable is empty or lazy.
    if preset not in VALID_PRESETS:
        valid = ", ".join(sorted(VALID_PRESETS))
        raise ValueError(f"Unknown preset '{preset}'. Choose from: {valid}")

    min_words, max_words = PRESET_TARGETS[preset]

    return [
        block
        for segment in segments
        for block in _split_segment(segment, min_words, max_words)
    ]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: subtitle-engine
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Generate SRT subtitles from audio/video files using WhisperX
|
|
5
5
|
Author: Leevi Puntanen
|
|
6
6
|
License-Expression: MIT
|
|
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
|
|
|
65
65
|
subeng video.mp4 --diarize --hf-token $HF_TOKEN
|
|
66
66
|
|
|
67
67
|
# Generate a caption from the transcript using Ollama
|
|
68
|
-
subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
68
|
+
subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
|
|
69
|
+
|
|
70
|
+
# Short-form subtitles (2-5 words per line, default)
|
|
71
|
+
subeng video.mp4 --preset shortform
|
|
72
|
+
|
|
73
|
+
# Long-form subtitles (10-14 words per line)
|
|
74
|
+
subeng video.mp4 --preset longform
|
|
69
75
|
```
|
|
70
76
|
|
|
71
77
|
## Options
|
|
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
|
83
89
|
| `--caption` | Generate a caption from the transcript via Ollama |
|
|
84
90
|
| `--ollama-model` | Ollama model name (required with `--caption`) |
|
|
85
91
|
| `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
|
|
92
|
+
| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
|
|
86
93
|
|
|
87
94
|
## Development
|
|
88
95
|
|
|
@@ -4,6 +4,7 @@ pyproject.toml
|
|
|
4
4
|
src/subtitle_engine/__init__.py
|
|
5
5
|
src/subtitle_engine/captioner.py
|
|
6
6
|
src/subtitle_engine/cli.py
|
|
7
|
+
src/subtitle_engine/segmenter.py
|
|
7
8
|
src/subtitle_engine/srt_writer.py
|
|
8
9
|
src/subtitle_engine/transcriber.py
|
|
9
10
|
src/subtitle_engine/updater.py
|
|
@@ -16,6 +17,7 @@ src/subtitle_engine.egg-info/requires.txt
|
|
|
16
17
|
src/subtitle_engine.egg-info/top_level.txt
|
|
17
18
|
tests/test_captioner.py
|
|
18
19
|
tests/test_cli.py
|
|
20
|
+
tests/test_segmenter.py
|
|
19
21
|
tests/test_srt_writer.py
|
|
20
22
|
tests/test_transcriber.py
|
|
21
23
|
tests/test_updater.py
|
|
@@ -67,7 +67,7 @@ def test_cli_version_long():
|
|
|
67
67
|
result = runner.invoke(app, ["--version"])
|
|
68
68
|
assert result.exit_code == 0
|
|
69
69
|
assert "subeng" in result.output
|
|
70
|
-
assert "0.1.
|
|
70
|
+
assert "0.1.2" in result.output
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def test_cli_version_short():
|
|
@@ -79,7 +79,7 @@ def test_cli_version_short():
|
|
|
79
79
|
def test_cli_version_no_extra_output():
|
|
80
80
|
result = runner.invoke(app, ["--version"])
|
|
81
81
|
assert result.exit_code == 0
|
|
82
|
-
assert result.output.strip() == "subeng 0.1.
|
|
82
|
+
assert result.output.strip() == "subeng 0.1.2"
|
|
83
83
|
|
|
84
84
|
|
|
85
85
|
def test_cli_quiet_hides_status_but_keeps_errors(tmp_path: Path):
|
|
@@ -145,3 +145,28 @@ def test_main_entry_runs_typer_app_for_transcription():
|
|
|
145
145
|
with patch.object(sys, "argv", ["subeng", "video.mp4"]):
|
|
146
146
|
main_entry()
|
|
147
147
|
mock_app.assert_called_once()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_cli_preset_shortform_accepted(tmp_path: Path):
|
|
151
|
+
media = tmp_path / "video.mp4"
|
|
152
|
+
media.write_bytes(b"fake")
|
|
153
|
+
result = runner.invoke(app, [str(media), "--preset", "shortform"])
|
|
154
|
+
# Validation passes; transcription fails because the file is fake.
|
|
155
|
+
assert result.exit_code != 0
|
|
156
|
+
assert "Preset: shortform" in result.output
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_cli_preset_longform_accepted(tmp_path: Path):
|
|
160
|
+
media = tmp_path / "video.mp4"
|
|
161
|
+
media.write_bytes(b"fake")
|
|
162
|
+
result = runner.invoke(app, [str(media), "--preset", "longform"])
|
|
163
|
+
assert result.exit_code != 0
|
|
164
|
+
assert "Preset: longform" in result.output
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_cli_invalid_preset_rejected(tmp_path: Path):
|
|
168
|
+
media = tmp_path / "video.mp4"
|
|
169
|
+
media.write_bytes(b"fake")
|
|
170
|
+
result = runner.invoke(app, [str(media), "--preset", "invalid"])
|
|
171
|
+
assert result.exit_code != 0
|
|
172
|
+
assert "Unknown preset" in result.output
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Tests for the segmenter/preset splitter."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from subtitle_engine.segmenter import (
|
|
6
|
+
PRESET_LONGFORM,
|
|
7
|
+
PRESET_SHORTFORM,
|
|
8
|
+
split_segments,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _segment(text: str, words: list[dict] | None = None, start: float = 0.0, end: float = 1.0) -> dict:
|
|
13
|
+
return {
|
|
14
|
+
"start": start,
|
|
15
|
+
"end": end,
|
|
16
|
+
"text": text,
|
|
17
|
+
"words": words if words is not None else [{"word": w} for w in text.split()],
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_shortform_splits_to_small_chunks():
|
|
22
|
+
segment = _segment("one two three four five six seven eight", start=0.0, end=8.0)
|
|
23
|
+
result = split_segments([segment], preset=PRESET_SHORTFORM)
|
|
24
|
+
|
|
25
|
+
assert len(result) >= 2
|
|
26
|
+
for chunk in result:
|
|
27
|
+
word_count = len(chunk["text"].split())
|
|
28
|
+
assert 1 <= word_count <= 5
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_longform_allows_larger_chunks():
|
|
32
|
+
segment = _segment(" ".join(str(i) for i in range(25)), start=0.0, end=25.0)
|
|
33
|
+
result = split_segments([segment], preset=PRESET_LONGFORM)
|
|
34
|
+
|
|
35
|
+
assert len(result) >= 1
|
|
36
|
+
for chunk in result:
|
|
37
|
+
word_count = len(chunk["text"].split())
|
|
38
|
+
assert 1 <= word_count <= 14
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_unknown_preset_raises():
|
|
42
|
+
with pytest.raises(ValueError, match="Unknown preset"):
|
|
43
|
+
split_segments([], preset="invalid")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_word_timings_are_used():
|
|
47
|
+
words = [
|
|
48
|
+
{"word": "hello", "start": 0.0, "end": 0.5},
|
|
49
|
+
{"word": "world", "start": 0.5, "end": 1.0},
|
|
50
|
+
{"word": "today", "start": 1.0, "end": 1.5},
|
|
51
|
+
]
|
|
52
|
+
segment = _segment("hello world today", words=words, start=0.0, end=1.5)
|
|
53
|
+
result = split_segments([segment], preset=PRESET_SHORTFORM)
|
|
54
|
+
|
|
55
|
+
assert result[0]["start"] == pytest.approx(0.0)
|
|
56
|
+
assert result[-1]["end"] == pytest.approx(1.5)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_speaker_label_is_preserved():
|
|
60
|
+
words = [
|
|
61
|
+
{"word": "hello", "speaker": "SPEAKER_01"},
|
|
62
|
+
{"word": "world", "speaker": "SPEAKER_01"},
|
|
63
|
+
{"word": "today", "speaker": "SPEAKER_02"},
|
|
64
|
+
]
|
|
65
|
+
segment = _segment("hello world today", words=words, start=0.0, end=3.0)
|
|
66
|
+
result = split_segments([segment], preset=PRESET_SHORTFORM)
|
|
67
|
+
|
|
68
|
+
assert any("[SPEAKER_01]" in chunk["text"] for chunk in result)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_empty_segment_is_ignored():
|
|
72
|
+
segment = {"start": 0.0, "end": 1.0, "text": " ", "words": []}
|
|
73
|
+
result = split_segments([segment], preset=PRESET_SHORTFORM)
|
|
74
|
+
assert result == []
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_segment_without_words_splits_by_text():
|
|
78
|
+
segment = {"start": 0.0, "end": 9.0, "text": "one two three four five six seven eight nine"}
|
|
79
|
+
result = split_segments([segment], preset=PRESET_SHORTFORM)
|
|
80
|
+
|
|
81
|
+
total_words = sum(len(chunk["text"].split()) for chunk in result)
|
|
82
|
+
assert total_words == 9
|
|
83
|
+
assert result[0]["start"] == pytest.approx(0.0)
|
|
84
|
+
assert result[-1]["end"] == pytest.approx(9.0)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|