@storyteller-platform/align 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/align.cjs +42 -117
- package/dist/align/align.d.cts +14 -1
- package/dist/align/align.d.ts +14 -1
- package/dist/align/align.js +42 -117
- package/dist/align/getSentenceRanges.cjs +165 -36
- package/dist/align/getSentenceRanges.d.cts +8 -2
- package/dist/align/getSentenceRanges.d.ts +8 -2
- package/dist/align/getSentenceRanges.js +165 -36
- package/dist/align/search.cjs +122 -0
- package/dist/align/search.d.cts +12 -0
- package/dist/align/search.d.ts +12 -0
- package/dist/align/search.js +96 -0
- package/dist/errorAlign/utils.d.cts +1 -1
- package/dist/errorAlign/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/align/fuzzy.cjs +0 -164
- package/dist/align/fuzzy.d.cts +0 -6
- package/dist/align/fuzzy.d.ts +0 -6
- package/dist/align/fuzzy.js +0 -141
package/dist/align/align.cjs
CHANGED
|
@@ -81,16 +81,14 @@ module.exports = __toCommonJS(align_exports);
|
|
|
81
81
|
var import_promises = require("node:fs/promises");
|
|
82
82
|
var import_node_path = require("node:path");
|
|
83
83
|
var import_posix = require("node:path/posix");
|
|
84
|
-
var import_itertools = require("itertools");
|
|
85
84
|
var import_memoize = __toESM(require("memoize"), 1);
|
|
86
|
-
var import_runes2 = require("runes2");
|
|
87
85
|
var import_audiobook = require("@storyteller-platform/audiobook");
|
|
88
86
|
var import_epub = require("@storyteller-platform/epub");
|
|
89
87
|
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
90
88
|
var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
91
89
|
var import_segmentation = require("../markup/segmentation.cjs");
|
|
92
|
-
var import_fuzzy = require("./fuzzy.cjs");
|
|
93
90
|
var import_getSentenceRanges = require("./getSentenceRanges.cjs");
|
|
91
|
+
var import_search = require("./search.cjs");
|
|
94
92
|
var import_slugify = require("./slugify.cjs");
|
|
95
93
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
96
94
|
var _stack = [];
|
|
@@ -167,83 +165,6 @@ class Aligner {
|
|
|
167
165
|
report = {
|
|
168
166
|
chapters: []
|
|
169
167
|
};
|
|
170
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
171
|
-
const reverse = dir < 0;
|
|
172
|
-
if (dir < 0) {
|
|
173
|
-
epubSentences = epubSentences.toReversed().map((s) => (0, import_runes2.runes)(s).toReversed().join(""));
|
|
174
|
-
transcriptionText = (0, import_runes2.runes)(transcriptionText).toReversed().join("");
|
|
175
|
-
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
176
|
-
}
|
|
177
|
-
const flatSliceIndices = [
|
|
178
|
-
0,
|
|
179
|
-
...this.alignedChapters.toSorted(
|
|
180
|
-
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
181
|
-
).flatMap((aligned) => [
|
|
182
|
-
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
183
|
-
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
184
|
-
]),
|
|
185
|
-
transcriptionText.length
|
|
186
|
-
];
|
|
187
|
-
const sliceIndices = [];
|
|
188
|
-
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
189
|
-
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
190
|
-
}
|
|
191
|
-
const allSlices = [];
|
|
192
|
-
let startSlice = 0;
|
|
193
|
-
for (const [i, [start, end]] of (0, import_itertools.enumerate)(sliceIndices)) {
|
|
194
|
-
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
195
|
-
if (!reverse) {
|
|
196
|
-
startSlice = i + 1;
|
|
197
|
-
allSlices.push({
|
|
198
|
-
start,
|
|
199
|
-
text: transcriptionText.slice(start, lastMatchOffset)
|
|
200
|
-
});
|
|
201
|
-
}
|
|
202
|
-
allSlices.push({
|
|
203
|
-
start: lastMatchOffset,
|
|
204
|
-
text: transcriptionText.slice(lastMatchOffset, end)
|
|
205
|
-
});
|
|
206
|
-
} else if (!reverse) {
|
|
207
|
-
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
const slices = allSlices.filter((slice) => slice.text.length);
|
|
211
|
-
if (reverse && !slices.length) {
|
|
212
|
-
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
213
|
-
if (indices) {
|
|
214
|
-
slices.push({
|
|
215
|
-
start: indices[0],
|
|
216
|
-
text: transcriptionText.slice(...indices)
|
|
217
|
-
});
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
221
|
-
let startSentence = 0;
|
|
222
|
-
while (startSentence < epubSentences.length) {
|
|
223
|
-
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
224
|
-
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
225
|
-
needle,
|
|
226
|
-
slice.text,
|
|
227
|
-
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
228
|
-
);
|
|
229
|
-
if (firstMatch) {
|
|
230
|
-
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
231
|
-
return {
|
|
232
|
-
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
233
|
-
transcriptionOffset: start
|
|
234
|
-
};
|
|
235
|
-
}
|
|
236
|
-
startSentence += 3;
|
|
237
|
-
}
|
|
238
|
-
}
|
|
239
|
-
if (reverse) {
|
|
240
|
-
return {
|
|
241
|
-
startSentence: epubSentences.length,
|
|
242
|
-
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
243
|
-
};
|
|
244
|
-
}
|
|
245
|
-
return { startSentence: 0, transcriptionOffset: null };
|
|
246
|
-
}
|
|
247
168
|
async getChapterSentences(chapterId) {
|
|
248
169
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
249
170
|
const { result: segmentation } = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
@@ -306,7 +227,7 @@ class Aligner {
|
|
|
306
227
|
value: import_epub.Epub.formatSmilDuration(chapterDuration)
|
|
307
228
|
});
|
|
308
229
|
}
|
|
309
|
-
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, transcriptionOffset) {
|
|
230
|
+
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
|
|
310
231
|
this.report.chapters.push({
|
|
311
232
|
href: chapter.href,
|
|
312
233
|
transcriptionOffset,
|
|
@@ -330,6 +251,14 @@ class Aligner {
|
|
|
330
251
|
matchedSentence: chapterSentences[startSentence],
|
|
331
252
|
nextSentence: chapterSentences[startSentence + 1] ?? null
|
|
332
253
|
},
|
|
254
|
+
lastMatchedSentenceId: endSentence,
|
|
255
|
+
lastMatchedSentenceContext: {
|
|
256
|
+
prevSentence: chapterSentences[endSentence - 1] ?? null,
|
|
257
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
258
|
+
matchedSentence: chapterSentences[endSentence],
|
|
259
|
+
nextSentence: chapterSentences[endSentence + 1] ?? null
|
|
260
|
+
},
|
|
261
|
+
chapterSentenceCount: sentenceRanges.length,
|
|
333
262
|
audioFiles: sentenceRanges.reduce((acc, range) => {
|
|
334
263
|
const existing = acc.find(
|
|
335
264
|
(context) => context.filepath === range.audiofile
|
|
@@ -347,7 +276,7 @@ class Aligner {
|
|
|
347
276
|
}, [])
|
|
348
277
|
});
|
|
349
278
|
}
|
|
350
|
-
async alignChapter(
|
|
279
|
+
async alignChapter(chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
351
280
|
const timing = (0, import_ghost_story.createTiming)();
|
|
352
281
|
timing.start("read contents");
|
|
353
282
|
const manifest = await this.epub.getManifest();
|
|
@@ -362,9 +291,12 @@ class Aligner {
|
|
|
362
291
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
363
292
|
timing.end("split to sentences");
|
|
364
293
|
timing.start("align sentences");
|
|
365
|
-
const {
|
|
366
|
-
|
|
367
|
-
|
|
294
|
+
const {
|
|
295
|
+
sentenceRanges,
|
|
296
|
+
transcriptionOffset: endTranscriptionOffset,
|
|
297
|
+
firstFoundSentence,
|
|
298
|
+
lastFoundSentence
|
|
299
|
+
} = await (0, import_getSentenceRanges.getSentenceRanges)(
|
|
368
300
|
this.transcription,
|
|
369
301
|
chapterSentences,
|
|
370
302
|
transcriptionOffset,
|
|
@@ -392,7 +324,8 @@ class Aligner {
|
|
|
392
324
|
chapter,
|
|
393
325
|
chapterSentences,
|
|
394
326
|
sentenceRanges,
|
|
395
|
-
|
|
327
|
+
firstFoundSentence,
|
|
328
|
+
lastFoundSentence,
|
|
396
329
|
transcriptionOffset
|
|
397
330
|
);
|
|
398
331
|
return {
|
|
@@ -401,8 +334,20 @@ class Aligner {
|
|
|
401
334
|
timing
|
|
402
335
|
};
|
|
403
336
|
}
|
|
337
|
+
narrowToAvailableBoundary(boundary) {
|
|
338
|
+
const narrowed = { ...boundary };
|
|
339
|
+
for (const chapter of this.alignedChapters) {
|
|
340
|
+
if (chapter.startOffset > narrowed.start && chapter.startOffset <= narrowed.end) {
|
|
341
|
+
narrowed.end = chapter.startOffset - 1;
|
|
342
|
+
}
|
|
343
|
+
if (chapter.endOffset < narrowed.end && chapter.endOffset >= narrowed.start) {
|
|
344
|
+
narrowed.start = chapter.endOffset + 1;
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
return narrowed;
|
|
348
|
+
}
|
|
404
349
|
async alignBook(onProgress) {
|
|
405
|
-
var _a, _b, _c, _d, _e, _f
|
|
350
|
+
var _a, _b, _c, _d, _e, _f;
|
|
406
351
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
407
352
|
this.timing.setMetadata("language", locale.toString());
|
|
408
353
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -412,7 +357,6 @@ class Aligner {
|
|
|
412
357
|
this.transcription.transcript,
|
|
413
358
|
locale
|
|
414
359
|
);
|
|
415
|
-
let lastTranscriptionOffset = 0;
|
|
416
360
|
for (let index = 0; index < spine.length; index++) {
|
|
417
361
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
418
362
|
const spineItem = spine[index];
|
|
@@ -441,48 +385,29 @@ class Aligner {
|
|
|
441
385
|
);
|
|
442
386
|
continue;
|
|
443
387
|
}
|
|
444
|
-
const
|
|
445
|
-
slugifiedChapterSentences,
|
|
446
|
-
transcriptionText
|
|
447
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
388
|
+
const boundaries = (0, import_search.findBoundaries)(
|
|
389
|
+
slugifiedChapterSentences.join("-"),
|
|
390
|
+
transcriptionText
|
|
448
391
|
);
|
|
449
|
-
if (
|
|
392
|
+
if (!boundaries) {
|
|
450
393
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
451
|
-
`
|
|
394
|
+
`Could not find chapter #${index} in the transcripton`
|
|
452
395
|
);
|
|
453
396
|
continue;
|
|
454
397
|
}
|
|
455
|
-
const
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
transcriptionOffset: slugifiedEndOffset
|
|
459
|
-
} = this.findBestOffset(
|
|
460
|
-
slugifiedChapterSentences,
|
|
461
|
-
transcriptionText,
|
|
462
|
-
Math.min(
|
|
463
|
-
transcriptionText.length,
|
|
464
|
-
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
465
|
-
),
|
|
466
|
-
-1
|
|
467
|
-
);
|
|
468
|
-
const endSentence = startEndSentence;
|
|
469
|
-
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
470
|
-
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
471
|
-
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
398
|
+
const { start, end } = this.narrowToAvailableBoundary(boundaries);
|
|
399
|
+
if (start === end) {
|
|
400
|
+
continue;
|
|
472
401
|
}
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
);
|
|
402
|
+
const transcriptionOffset = mapping.invert().map(Math.max(start, 0), -1);
|
|
403
|
+
const endOffset = mapping.invert().map(Math.min(end, transcriptionText.length), 1);
|
|
476
404
|
const result = await this.alignChapter(
|
|
477
|
-
startSentence,
|
|
478
|
-
endSentence,
|
|
479
405
|
chapterId,
|
|
480
406
|
transcriptionOffset,
|
|
481
407
|
endOffset,
|
|
482
408
|
locale,
|
|
483
409
|
mapping
|
|
484
410
|
);
|
|
485
|
-
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
486
411
|
this.timing.add(result.timing.summary());
|
|
487
412
|
}
|
|
488
413
|
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|
package/dist/align/align.d.cts
CHANGED
|
@@ -22,6 +22,13 @@ interface ChapterReport {
|
|
|
22
22
|
matchedSentence: string;
|
|
23
23
|
nextSentence: string | null;
|
|
24
24
|
};
|
|
25
|
+
lastMatchedSentenceId: number;
|
|
26
|
+
lastMatchedSentenceContext: {
|
|
27
|
+
prevSentence: string | null;
|
|
28
|
+
matchedSentence: string;
|
|
29
|
+
nextSentence: string | null;
|
|
30
|
+
};
|
|
31
|
+
chapterSentenceCount: number;
|
|
25
32
|
audioFiles: AudioFileContext[];
|
|
26
33
|
}
|
|
27
34
|
interface Report {
|
|
@@ -47,11 +54,17 @@ declare class Aligner {
|
|
|
47
54
|
private granularity;
|
|
48
55
|
report: Report;
|
|
49
56
|
constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
|
|
50
|
-
private findBestOffset;
|
|
51
57
|
private getChapterSentences;
|
|
52
58
|
private writeAlignedChapter;
|
|
53
59
|
private addChapterReport;
|
|
54
60
|
private alignChapter;
|
|
61
|
+
narrowToAvailableBoundary(boundary: {
|
|
62
|
+
start: number;
|
|
63
|
+
end: number;
|
|
64
|
+
}): {
|
|
65
|
+
start: number;
|
|
66
|
+
end: number;
|
|
67
|
+
};
|
|
55
68
|
alignBook(onProgress?: ((progress: number) => void) | null): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
56
69
|
}
|
|
57
70
|
declare function concatTranscriptions(transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], audiofiles: string[]): StorytellerTranscription;
|
package/dist/align/align.d.ts
CHANGED
|
@@ -22,6 +22,13 @@ interface ChapterReport {
|
|
|
22
22
|
matchedSentence: string;
|
|
23
23
|
nextSentence: string | null;
|
|
24
24
|
};
|
|
25
|
+
lastMatchedSentenceId: number;
|
|
26
|
+
lastMatchedSentenceContext: {
|
|
27
|
+
prevSentence: string | null;
|
|
28
|
+
matchedSentence: string;
|
|
29
|
+
nextSentence: string | null;
|
|
30
|
+
};
|
|
31
|
+
chapterSentenceCount: number;
|
|
25
32
|
audioFiles: AudioFileContext[];
|
|
26
33
|
}
|
|
27
34
|
interface Report {
|
|
@@ -47,11 +54,17 @@ declare class Aligner {
|
|
|
47
54
|
private granularity;
|
|
48
55
|
report: Report;
|
|
49
56
|
constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
|
|
50
|
-
private findBestOffset;
|
|
51
57
|
private getChapterSentences;
|
|
52
58
|
private writeAlignedChapter;
|
|
53
59
|
private addChapterReport;
|
|
54
60
|
private alignChapter;
|
|
61
|
+
narrowToAvailableBoundary(boundary: {
|
|
62
|
+
start: number;
|
|
63
|
+
end: number;
|
|
64
|
+
}): {
|
|
65
|
+
start: number;
|
|
66
|
+
end: number;
|
|
67
|
+
};
|
|
55
68
|
alignBook(onProgress?: ((progress: number) => void) | null): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
56
69
|
}
|
|
57
70
|
declare function concatTranscriptions(transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], audiofiles: string[]): StorytellerTranscription;
|
package/dist/align/align.js
CHANGED
|
@@ -5,9 +5,7 @@ import {
|
|
|
5
5
|
import { copyFile, mkdir, readFile, readdir, writeFile } from "node:fs/promises";
|
|
6
6
|
import { dirname as autoDirname, join as autoJoin } from "node:path";
|
|
7
7
|
import { basename, dirname, parse, relative } from "node:path/posix";
|
|
8
|
-
import { enumerate } from "itertools";
|
|
9
8
|
import memoize from "memoize";
|
|
10
|
-
import { runes } from "runes2";
|
|
11
9
|
import { isAudioFile, lookupAudioMime } from "@storyteller-platform/audiobook";
|
|
12
10
|
import {
|
|
13
11
|
Epub
|
|
@@ -18,13 +16,13 @@ import {
|
|
|
18
16
|
} from "@storyteller-platform/ghost-story";
|
|
19
17
|
import { getTrackDuration } from "../common/ffmpeg.js";
|
|
20
18
|
import { getXhtmlSegmentation } from "../markup/segmentation.js";
|
|
21
|
-
import { findNearestMatch } from "./fuzzy.js";
|
|
22
19
|
import {
|
|
23
20
|
expandEmptySentenceRanges,
|
|
24
21
|
getChapterDuration,
|
|
25
22
|
getSentenceRanges,
|
|
26
23
|
interpolateSentenceRanges
|
|
27
24
|
} from "./getSentenceRanges.js";
|
|
25
|
+
import { findBoundaries } from "./search.js";
|
|
28
26
|
import { slugify } from "./slugify.js";
|
|
29
27
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
30
28
|
var _stack = [];
|
|
@@ -101,83 +99,6 @@ class Aligner {
|
|
|
101
99
|
report = {
|
|
102
100
|
chapters: []
|
|
103
101
|
};
|
|
104
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
105
|
-
const reverse = dir < 0;
|
|
106
|
-
if (dir < 0) {
|
|
107
|
-
epubSentences = epubSentences.toReversed().map((s) => runes(s).toReversed().join(""));
|
|
108
|
-
transcriptionText = runes(transcriptionText).toReversed().join("");
|
|
109
|
-
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
110
|
-
}
|
|
111
|
-
const flatSliceIndices = [
|
|
112
|
-
0,
|
|
113
|
-
...this.alignedChapters.toSorted(
|
|
114
|
-
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
115
|
-
).flatMap((aligned) => [
|
|
116
|
-
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
117
|
-
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
118
|
-
]),
|
|
119
|
-
transcriptionText.length
|
|
120
|
-
];
|
|
121
|
-
const sliceIndices = [];
|
|
122
|
-
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
123
|
-
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
124
|
-
}
|
|
125
|
-
const allSlices = [];
|
|
126
|
-
let startSlice = 0;
|
|
127
|
-
for (const [i, [start, end]] of enumerate(sliceIndices)) {
|
|
128
|
-
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
129
|
-
if (!reverse) {
|
|
130
|
-
startSlice = i + 1;
|
|
131
|
-
allSlices.push({
|
|
132
|
-
start,
|
|
133
|
-
text: transcriptionText.slice(start, lastMatchOffset)
|
|
134
|
-
});
|
|
135
|
-
}
|
|
136
|
-
allSlices.push({
|
|
137
|
-
start: lastMatchOffset,
|
|
138
|
-
text: transcriptionText.slice(lastMatchOffset, end)
|
|
139
|
-
});
|
|
140
|
-
} else if (!reverse) {
|
|
141
|
-
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
const slices = allSlices.filter((slice) => slice.text.length);
|
|
145
|
-
if (reverse && !slices.length) {
|
|
146
|
-
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
147
|
-
if (indices) {
|
|
148
|
-
slices.push({
|
|
149
|
-
start: indices[0],
|
|
150
|
-
text: transcriptionText.slice(...indices)
|
|
151
|
-
});
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
155
|
-
let startSentence = 0;
|
|
156
|
-
while (startSentence < epubSentences.length) {
|
|
157
|
-
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
158
|
-
const firstMatch = findNearestMatch(
|
|
159
|
-
needle,
|
|
160
|
-
slice.text,
|
|
161
|
-
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
162
|
-
);
|
|
163
|
-
if (firstMatch) {
|
|
164
|
-
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
165
|
-
return {
|
|
166
|
-
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
167
|
-
transcriptionOffset: start
|
|
168
|
-
};
|
|
169
|
-
}
|
|
170
|
-
startSentence += 3;
|
|
171
|
-
}
|
|
172
|
-
}
|
|
173
|
-
if (reverse) {
|
|
174
|
-
return {
|
|
175
|
-
startSentence: epubSentences.length,
|
|
176
|
-
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
177
|
-
};
|
|
178
|
-
}
|
|
179
|
-
return { startSentence: 0, transcriptionOffset: null };
|
|
180
|
-
}
|
|
181
102
|
async getChapterSentences(chapterId) {
|
|
182
103
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
183
104
|
const { result: segmentation } = await getXhtmlSegmentation(
|
|
@@ -240,7 +161,7 @@ class Aligner {
|
|
|
240
161
|
value: Epub.formatSmilDuration(chapterDuration)
|
|
241
162
|
});
|
|
242
163
|
}
|
|
243
|
-
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, transcriptionOffset) {
|
|
164
|
+
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
|
|
244
165
|
this.report.chapters.push({
|
|
245
166
|
href: chapter.href,
|
|
246
167
|
transcriptionOffset,
|
|
@@ -264,6 +185,14 @@ class Aligner {
|
|
|
264
185
|
matchedSentence: chapterSentences[startSentence],
|
|
265
186
|
nextSentence: chapterSentences[startSentence + 1] ?? null
|
|
266
187
|
},
|
|
188
|
+
lastMatchedSentenceId: endSentence,
|
|
189
|
+
lastMatchedSentenceContext: {
|
|
190
|
+
prevSentence: chapterSentences[endSentence - 1] ?? null,
|
|
191
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
192
|
+
matchedSentence: chapterSentences[endSentence],
|
|
193
|
+
nextSentence: chapterSentences[endSentence + 1] ?? null
|
|
194
|
+
},
|
|
195
|
+
chapterSentenceCount: sentenceRanges.length,
|
|
267
196
|
audioFiles: sentenceRanges.reduce((acc, range) => {
|
|
268
197
|
const existing = acc.find(
|
|
269
198
|
(context) => context.filepath === range.audiofile
|
|
@@ -281,7 +210,7 @@ class Aligner {
|
|
|
281
210
|
}, [])
|
|
282
211
|
});
|
|
283
212
|
}
|
|
284
|
-
async alignChapter(
|
|
213
|
+
async alignChapter(chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
285
214
|
const timing = createTiming();
|
|
286
215
|
timing.start("read contents");
|
|
287
216
|
const manifest = await this.epub.getManifest();
|
|
@@ -296,9 +225,12 @@ class Aligner {
|
|
|
296
225
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
297
226
|
timing.end("split to sentences");
|
|
298
227
|
timing.start("align sentences");
|
|
299
|
-
const {
|
|
300
|
-
|
|
301
|
-
|
|
228
|
+
const {
|
|
229
|
+
sentenceRanges,
|
|
230
|
+
transcriptionOffset: endTranscriptionOffset,
|
|
231
|
+
firstFoundSentence,
|
|
232
|
+
lastFoundSentence
|
|
233
|
+
} = await getSentenceRanges(
|
|
302
234
|
this.transcription,
|
|
303
235
|
chapterSentences,
|
|
304
236
|
transcriptionOffset,
|
|
@@ -326,7 +258,8 @@ class Aligner {
|
|
|
326
258
|
chapter,
|
|
327
259
|
chapterSentences,
|
|
328
260
|
sentenceRanges,
|
|
329
|
-
|
|
261
|
+
firstFoundSentence,
|
|
262
|
+
lastFoundSentence,
|
|
330
263
|
transcriptionOffset
|
|
331
264
|
);
|
|
332
265
|
return {
|
|
@@ -335,8 +268,20 @@ class Aligner {
|
|
|
335
268
|
timing
|
|
336
269
|
};
|
|
337
270
|
}
|
|
271
|
+
narrowToAvailableBoundary(boundary) {
|
|
272
|
+
const narrowed = { ...boundary };
|
|
273
|
+
for (const chapter of this.alignedChapters) {
|
|
274
|
+
if (chapter.startOffset > narrowed.start && chapter.startOffset <= narrowed.end) {
|
|
275
|
+
narrowed.end = chapter.startOffset - 1;
|
|
276
|
+
}
|
|
277
|
+
if (chapter.endOffset < narrowed.end && chapter.endOffset >= narrowed.start) {
|
|
278
|
+
narrowed.start = chapter.endOffset + 1;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
return narrowed;
|
|
282
|
+
}
|
|
338
283
|
async alignBook(onProgress) {
|
|
339
|
-
var _a, _b, _c, _d, _e, _f
|
|
284
|
+
var _a, _b, _c, _d, _e, _f;
|
|
340
285
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
341
286
|
this.timing.setMetadata("language", locale.toString());
|
|
342
287
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -346,7 +291,6 @@ class Aligner {
|
|
|
346
291
|
this.transcription.transcript,
|
|
347
292
|
locale
|
|
348
293
|
);
|
|
349
|
-
let lastTranscriptionOffset = 0;
|
|
350
294
|
for (let index = 0; index < spine.length; index++) {
|
|
351
295
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
352
296
|
const spineItem = spine[index];
|
|
@@ -375,48 +319,29 @@ class Aligner {
|
|
|
375
319
|
);
|
|
376
320
|
continue;
|
|
377
321
|
}
|
|
378
|
-
const
|
|
379
|
-
slugifiedChapterSentences,
|
|
380
|
-
transcriptionText
|
|
381
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
322
|
+
const boundaries = findBoundaries(
|
|
323
|
+
slugifiedChapterSentences.join("-"),
|
|
324
|
+
transcriptionText
|
|
382
325
|
);
|
|
383
|
-
if (
|
|
326
|
+
if (!boundaries) {
|
|
384
327
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
385
|
-
`
|
|
328
|
+
`Could not find chapter #${index} in the transcripton`
|
|
386
329
|
);
|
|
387
330
|
continue;
|
|
388
331
|
}
|
|
389
|
-
const
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
transcriptionOffset: slugifiedEndOffset
|
|
393
|
-
} = this.findBestOffset(
|
|
394
|
-
slugifiedChapterSentences,
|
|
395
|
-
transcriptionText,
|
|
396
|
-
Math.min(
|
|
397
|
-
transcriptionText.length,
|
|
398
|
-
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
399
|
-
),
|
|
400
|
-
-1
|
|
401
|
-
);
|
|
402
|
-
const endSentence = startEndSentence;
|
|
403
|
-
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
404
|
-
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
405
|
-
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
332
|
+
const { start, end } = this.narrowToAvailableBoundary(boundaries);
|
|
333
|
+
if (start === end) {
|
|
334
|
+
continue;
|
|
406
335
|
}
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
);
|
|
336
|
+
const transcriptionOffset = mapping.invert().map(Math.max(start, 0), -1);
|
|
337
|
+
const endOffset = mapping.invert().map(Math.min(end, transcriptionText.length), 1);
|
|
410
338
|
const result = await this.alignChapter(
|
|
411
|
-
startSentence,
|
|
412
|
-
endSentence,
|
|
413
339
|
chapterId,
|
|
414
340
|
transcriptionOffset,
|
|
415
341
|
endOffset,
|
|
416
342
|
locale,
|
|
417
343
|
mapping
|
|
418
344
|
);
|
|
419
|
-
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
420
345
|
this.timing.add(result.timing.summary());
|
|
421
346
|
}
|
|
422
347
|
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|