@storyteller-platform/align 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/align.cjs +53 -118
- package/dist/align/align.d.cts +14 -1
- package/dist/align/align.d.ts +14 -1
- package/dist/align/align.js +53 -118
- package/dist/align/getSentenceRanges.cjs +165 -36
- package/dist/align/getSentenceRanges.d.cts +8 -2
- package/dist/align/getSentenceRanges.d.ts +8 -2
- package/dist/align/getSentenceRanges.js +165 -36
- package/dist/align/search.cjs +122 -0
- package/dist/align/search.d.cts +12 -0
- package/dist/align/search.d.ts +12 -0
- package/dist/align/search.js +96 -0
- package/dist/errorAlign/utils.d.cts +1 -1
- package/dist/errorAlign/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/align/fuzzy.cjs +0 -164
- package/dist/align/fuzzy.d.cts +0 -6
- package/dist/align/fuzzy.d.ts +0 -6
- package/dist/align/fuzzy.js +0 -141
package/dist/align/align.cjs
CHANGED
|
@@ -81,16 +81,14 @@ module.exports = __toCommonJS(align_exports);
|
|
|
81
81
|
var import_promises = require("node:fs/promises");
|
|
82
82
|
var import_node_path = require("node:path");
|
|
83
83
|
var import_posix = require("node:path/posix");
|
|
84
|
-
var import_itertools = require("itertools");
|
|
85
84
|
var import_memoize = __toESM(require("memoize"), 1);
|
|
86
|
-
var import_runes2 = require("runes2");
|
|
87
85
|
var import_audiobook = require("@storyteller-platform/audiobook");
|
|
88
86
|
var import_epub = require("@storyteller-platform/epub");
|
|
89
87
|
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
90
88
|
var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
91
89
|
var import_segmentation = require("../markup/segmentation.cjs");
|
|
92
|
-
var import_fuzzy = require("./fuzzy.cjs");
|
|
93
90
|
var import_getSentenceRanges = require("./getSentenceRanges.cjs");
|
|
91
|
+
var import_search = require("./search.cjs");
|
|
94
92
|
var import_slugify = require("./slugify.cjs");
|
|
95
93
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
96
94
|
var _stack = [];
|
|
@@ -111,7 +109,17 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
111
109
|
(contents) => contents.map(
|
|
112
110
|
(c) => JSON.parse(c)
|
|
113
111
|
)
|
|
114
|
-
)
|
|
112
|
+
).then((transcriptions2) => {
|
|
113
|
+
return transcriptions2.map((transcription) => {
|
|
114
|
+
if ("wordTimeline" in transcription) {
|
|
115
|
+
return {
|
|
116
|
+
...transcription,
|
|
117
|
+
timeline: transcription.wordTimeline
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
return transcription;
|
|
121
|
+
});
|
|
122
|
+
});
|
|
115
123
|
const aligner = new Aligner(
|
|
116
124
|
epub,
|
|
117
125
|
audiobookFiles,
|
|
@@ -157,83 +165,6 @@ class Aligner {
|
|
|
157
165
|
report = {
|
|
158
166
|
chapters: []
|
|
159
167
|
};
|
|
160
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
161
|
-
const reverse = dir < 0;
|
|
162
|
-
if (dir < 0) {
|
|
163
|
-
epubSentences = epubSentences.toReversed().map((s) => (0, import_runes2.runes)(s).toReversed().join(""));
|
|
164
|
-
transcriptionText = (0, import_runes2.runes)(transcriptionText).toReversed().join("");
|
|
165
|
-
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
166
|
-
}
|
|
167
|
-
const flatSliceIndices = [
|
|
168
|
-
0,
|
|
169
|
-
...this.alignedChapters.toSorted(
|
|
170
|
-
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
171
|
-
).flatMap((aligned) => [
|
|
172
|
-
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
173
|
-
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
174
|
-
]),
|
|
175
|
-
transcriptionText.length
|
|
176
|
-
];
|
|
177
|
-
const sliceIndices = [];
|
|
178
|
-
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
179
|
-
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
180
|
-
}
|
|
181
|
-
const allSlices = [];
|
|
182
|
-
let startSlice = 0;
|
|
183
|
-
for (const [i, [start, end]] of (0, import_itertools.enumerate)(sliceIndices)) {
|
|
184
|
-
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
185
|
-
if (!reverse) {
|
|
186
|
-
startSlice = i + 1;
|
|
187
|
-
allSlices.push({
|
|
188
|
-
start,
|
|
189
|
-
text: transcriptionText.slice(start, lastMatchOffset)
|
|
190
|
-
});
|
|
191
|
-
}
|
|
192
|
-
allSlices.push({
|
|
193
|
-
start: lastMatchOffset,
|
|
194
|
-
text: transcriptionText.slice(lastMatchOffset, end)
|
|
195
|
-
});
|
|
196
|
-
} else if (!reverse) {
|
|
197
|
-
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
const slices = allSlices.filter((slice) => slice.text.length);
|
|
201
|
-
if (reverse && !slices.length) {
|
|
202
|
-
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
203
|
-
if (indices) {
|
|
204
|
-
slices.push({
|
|
205
|
-
start: indices[0],
|
|
206
|
-
text: transcriptionText.slice(...indices)
|
|
207
|
-
});
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
211
|
-
let startSentence = 0;
|
|
212
|
-
while (startSentence < epubSentences.length) {
|
|
213
|
-
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
214
|
-
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
215
|
-
needle,
|
|
216
|
-
slice.text,
|
|
217
|
-
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
218
|
-
);
|
|
219
|
-
if (firstMatch) {
|
|
220
|
-
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
221
|
-
return {
|
|
222
|
-
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
223
|
-
transcriptionOffset: start
|
|
224
|
-
};
|
|
225
|
-
}
|
|
226
|
-
startSentence += 3;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
if (reverse) {
|
|
230
|
-
return {
|
|
231
|
-
startSentence: epubSentences.length,
|
|
232
|
-
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
233
|
-
};
|
|
234
|
-
}
|
|
235
|
-
return { startSentence: 0, transcriptionOffset: null };
|
|
236
|
-
}
|
|
237
168
|
async getChapterSentences(chapterId) {
|
|
238
169
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
239
170
|
const { result: segmentation } = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
@@ -296,7 +227,7 @@ class Aligner {
|
|
|
296
227
|
value: import_epub.Epub.formatSmilDuration(chapterDuration)
|
|
297
228
|
});
|
|
298
229
|
}
|
|
299
|
-
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, transcriptionOffset) {
|
|
230
|
+
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
|
|
300
231
|
this.report.chapters.push({
|
|
301
232
|
href: chapter.href,
|
|
302
233
|
transcriptionOffset,
|
|
@@ -320,6 +251,14 @@ class Aligner {
|
|
|
320
251
|
matchedSentence: chapterSentences[startSentence],
|
|
321
252
|
nextSentence: chapterSentences[startSentence + 1] ?? null
|
|
322
253
|
},
|
|
254
|
+
lastMatchedSentenceId: endSentence,
|
|
255
|
+
lastMatchedSentenceContext: {
|
|
256
|
+
prevSentence: chapterSentences[endSentence - 1] ?? null,
|
|
257
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
258
|
+
matchedSentence: chapterSentences[endSentence],
|
|
259
|
+
nextSentence: chapterSentences[endSentence + 1] ?? null
|
|
260
|
+
},
|
|
261
|
+
chapterSentenceCount: sentenceRanges.length,
|
|
323
262
|
audioFiles: sentenceRanges.reduce((acc, range) => {
|
|
324
263
|
const existing = acc.find(
|
|
325
264
|
(context) => context.filepath === range.audiofile
|
|
@@ -337,7 +276,7 @@ class Aligner {
|
|
|
337
276
|
}, [])
|
|
338
277
|
});
|
|
339
278
|
}
|
|
340
|
-
async alignChapter(
|
|
279
|
+
async alignChapter(chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
341
280
|
const timing = (0, import_ghost_story.createTiming)();
|
|
342
281
|
timing.start("read contents");
|
|
343
282
|
const manifest = await this.epub.getManifest();
|
|
@@ -352,9 +291,12 @@ class Aligner {
|
|
|
352
291
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
353
292
|
timing.end("split to sentences");
|
|
354
293
|
timing.start("align sentences");
|
|
355
|
-
const {
|
|
356
|
-
|
|
357
|
-
|
|
294
|
+
const {
|
|
295
|
+
sentenceRanges,
|
|
296
|
+
transcriptionOffset: endTranscriptionOffset,
|
|
297
|
+
firstFoundSentence,
|
|
298
|
+
lastFoundSentence
|
|
299
|
+
} = await (0, import_getSentenceRanges.getSentenceRanges)(
|
|
358
300
|
this.transcription,
|
|
359
301
|
chapterSentences,
|
|
360
302
|
transcriptionOffset,
|
|
@@ -382,7 +324,8 @@ class Aligner {
|
|
|
382
324
|
chapter,
|
|
383
325
|
chapterSentences,
|
|
384
326
|
sentenceRanges,
|
|
385
|
-
|
|
327
|
+
firstFoundSentence,
|
|
328
|
+
lastFoundSentence,
|
|
386
329
|
transcriptionOffset
|
|
387
330
|
);
|
|
388
331
|
return {
|
|
@@ -391,8 +334,20 @@ class Aligner {
|
|
|
391
334
|
timing
|
|
392
335
|
};
|
|
393
336
|
}
|
|
337
|
+
narrowToAvailableBoundary(boundary) {
|
|
338
|
+
const narrowed = { ...boundary };
|
|
339
|
+
for (const chapter of this.alignedChapters) {
|
|
340
|
+
if (chapter.startOffset > narrowed.start && chapter.startOffset <= narrowed.end) {
|
|
341
|
+
narrowed.end = chapter.startOffset - 1;
|
|
342
|
+
}
|
|
343
|
+
if (chapter.endOffset < narrowed.end && chapter.endOffset >= narrowed.start) {
|
|
344
|
+
narrowed.start = chapter.endOffset + 1;
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
return narrowed;
|
|
348
|
+
}
|
|
394
349
|
async alignBook(onProgress) {
|
|
395
|
-
var _a, _b, _c, _d, _e, _f
|
|
350
|
+
var _a, _b, _c, _d, _e, _f;
|
|
396
351
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
397
352
|
this.timing.setMetadata("language", locale.toString());
|
|
398
353
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -402,7 +357,6 @@ class Aligner {
|
|
|
402
357
|
this.transcription.transcript,
|
|
403
358
|
locale
|
|
404
359
|
);
|
|
405
|
-
let lastTranscriptionOffset = 0;
|
|
406
360
|
for (let index = 0; index < spine.length; index++) {
|
|
407
361
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
408
362
|
const spineItem = spine[index];
|
|
@@ -431,48 +385,29 @@ class Aligner {
|
|
|
431
385
|
);
|
|
432
386
|
continue;
|
|
433
387
|
}
|
|
434
|
-
const
|
|
435
|
-
slugifiedChapterSentences,
|
|
436
|
-
transcriptionText
|
|
437
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
388
|
+
const boundaries = (0, import_search.findBoundaries)(
|
|
389
|
+
slugifiedChapterSentences.join("-"),
|
|
390
|
+
transcriptionText
|
|
438
391
|
);
|
|
439
|
-
if (
|
|
392
|
+
if (!boundaries) {
|
|
440
393
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
441
|
-
`
|
|
394
|
+
`Could not find chapter #${index} in the transcripton`
|
|
442
395
|
);
|
|
443
396
|
continue;
|
|
444
397
|
}
|
|
445
|
-
const
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
transcriptionOffset: slugifiedEndOffset
|
|
449
|
-
} = this.findBestOffset(
|
|
450
|
-
slugifiedChapterSentences,
|
|
451
|
-
transcriptionText,
|
|
452
|
-
Math.min(
|
|
453
|
-
transcriptionText.length,
|
|
454
|
-
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
455
|
-
),
|
|
456
|
-
-1
|
|
457
|
-
);
|
|
458
|
-
const endSentence = startEndSentence;
|
|
459
|
-
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
460
|
-
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
461
|
-
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
398
|
+
const { start, end } = this.narrowToAvailableBoundary(boundaries);
|
|
399
|
+
if (start === end) {
|
|
400
|
+
continue;
|
|
462
401
|
}
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
);
|
|
402
|
+
const transcriptionOffset = mapping.invert().map(Math.max(start, 0), -1);
|
|
403
|
+
const endOffset = mapping.invert().map(Math.min(end, transcriptionText.length), 1);
|
|
466
404
|
const result = await this.alignChapter(
|
|
467
|
-
startSentence,
|
|
468
|
-
endSentence,
|
|
469
405
|
chapterId,
|
|
470
406
|
transcriptionOffset,
|
|
471
407
|
endOffset,
|
|
472
408
|
locale,
|
|
473
409
|
mapping
|
|
474
410
|
);
|
|
475
|
-
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
476
411
|
this.timing.add(result.timing.summary());
|
|
477
412
|
}
|
|
478
413
|
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|
package/dist/align/align.d.cts
CHANGED
|
@@ -22,6 +22,13 @@ interface ChapterReport {
|
|
|
22
22
|
matchedSentence: string;
|
|
23
23
|
nextSentence: string | null;
|
|
24
24
|
};
|
|
25
|
+
lastMatchedSentenceId: number;
|
|
26
|
+
lastMatchedSentenceContext: {
|
|
27
|
+
prevSentence: string | null;
|
|
28
|
+
matchedSentence: string;
|
|
29
|
+
nextSentence: string | null;
|
|
30
|
+
};
|
|
31
|
+
chapterSentenceCount: number;
|
|
25
32
|
audioFiles: AudioFileContext[];
|
|
26
33
|
}
|
|
27
34
|
interface Report {
|
|
@@ -47,11 +54,17 @@ declare class Aligner {
|
|
|
47
54
|
private granularity;
|
|
48
55
|
report: Report;
|
|
49
56
|
constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
|
|
50
|
-
private findBestOffset;
|
|
51
57
|
private getChapterSentences;
|
|
52
58
|
private writeAlignedChapter;
|
|
53
59
|
private addChapterReport;
|
|
54
60
|
private alignChapter;
|
|
61
|
+
narrowToAvailableBoundary(boundary: {
|
|
62
|
+
start: number;
|
|
63
|
+
end: number;
|
|
64
|
+
}): {
|
|
65
|
+
start: number;
|
|
66
|
+
end: number;
|
|
67
|
+
};
|
|
55
68
|
alignBook(onProgress?: ((progress: number) => void) | null): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
56
69
|
}
|
|
57
70
|
declare function concatTranscriptions(transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], audiofiles: string[]): StorytellerTranscription;
|
package/dist/align/align.d.ts
CHANGED
|
@@ -22,6 +22,13 @@ interface ChapterReport {
|
|
|
22
22
|
matchedSentence: string;
|
|
23
23
|
nextSentence: string | null;
|
|
24
24
|
};
|
|
25
|
+
lastMatchedSentenceId: number;
|
|
26
|
+
lastMatchedSentenceContext: {
|
|
27
|
+
prevSentence: string | null;
|
|
28
|
+
matchedSentence: string;
|
|
29
|
+
nextSentence: string | null;
|
|
30
|
+
};
|
|
31
|
+
chapterSentenceCount: number;
|
|
25
32
|
audioFiles: AudioFileContext[];
|
|
26
33
|
}
|
|
27
34
|
interface Report {
|
|
@@ -47,11 +54,17 @@ declare class Aligner {
|
|
|
47
54
|
private granularity;
|
|
48
55
|
report: Report;
|
|
49
56
|
constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
|
|
50
|
-
private findBestOffset;
|
|
51
57
|
private getChapterSentences;
|
|
52
58
|
private writeAlignedChapter;
|
|
53
59
|
private addChapterReport;
|
|
54
60
|
private alignChapter;
|
|
61
|
+
narrowToAvailableBoundary(boundary: {
|
|
62
|
+
start: number;
|
|
63
|
+
end: number;
|
|
64
|
+
}): {
|
|
65
|
+
start: number;
|
|
66
|
+
end: number;
|
|
67
|
+
};
|
|
55
68
|
alignBook(onProgress?: ((progress: number) => void) | null): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
56
69
|
}
|
|
57
70
|
declare function concatTranscriptions(transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], audiofiles: string[]): StorytellerTranscription;
|
package/dist/align/align.js
CHANGED
|
@@ -5,9 +5,7 @@ import {
|
|
|
5
5
|
import { copyFile, mkdir, readFile, readdir, writeFile } from "node:fs/promises";
|
|
6
6
|
import { dirname as autoDirname, join as autoJoin } from "node:path";
|
|
7
7
|
import { basename, dirname, parse, relative } from "node:path/posix";
|
|
8
|
-
import { enumerate } from "itertools";
|
|
9
8
|
import memoize from "memoize";
|
|
10
|
-
import { runes } from "runes2";
|
|
11
9
|
import { isAudioFile, lookupAudioMime } from "@storyteller-platform/audiobook";
|
|
12
10
|
import {
|
|
13
11
|
Epub
|
|
@@ -18,13 +16,13 @@ import {
|
|
|
18
16
|
} from "@storyteller-platform/ghost-story";
|
|
19
17
|
import { getTrackDuration } from "../common/ffmpeg.js";
|
|
20
18
|
import { getXhtmlSegmentation } from "../markup/segmentation.js";
|
|
21
|
-
import { findNearestMatch } from "./fuzzy.js";
|
|
22
19
|
import {
|
|
23
20
|
expandEmptySentenceRanges,
|
|
24
21
|
getChapterDuration,
|
|
25
22
|
getSentenceRanges,
|
|
26
23
|
interpolateSentenceRanges
|
|
27
24
|
} from "./getSentenceRanges.js";
|
|
25
|
+
import { findBoundaries } from "./search.js";
|
|
28
26
|
import { slugify } from "./slugify.js";
|
|
29
27
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
30
28
|
var _stack = [];
|
|
@@ -45,7 +43,17 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
45
43
|
(contents) => contents.map(
|
|
46
44
|
(c) => JSON.parse(c)
|
|
47
45
|
)
|
|
48
|
-
)
|
|
46
|
+
).then((transcriptions2) => {
|
|
47
|
+
return transcriptions2.map((transcription) => {
|
|
48
|
+
if ("wordTimeline" in transcription) {
|
|
49
|
+
return {
|
|
50
|
+
...transcription,
|
|
51
|
+
timeline: transcription.wordTimeline
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
return transcription;
|
|
55
|
+
});
|
|
56
|
+
});
|
|
49
57
|
const aligner = new Aligner(
|
|
50
58
|
epub,
|
|
51
59
|
audiobookFiles,
|
|
@@ -91,83 +99,6 @@ class Aligner {
|
|
|
91
99
|
report = {
|
|
92
100
|
chapters: []
|
|
93
101
|
};
|
|
94
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
95
|
-
const reverse = dir < 0;
|
|
96
|
-
if (dir < 0) {
|
|
97
|
-
epubSentences = epubSentences.toReversed().map((s) => runes(s).toReversed().join(""));
|
|
98
|
-
transcriptionText = runes(transcriptionText).toReversed().join("");
|
|
99
|
-
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
100
|
-
}
|
|
101
|
-
const flatSliceIndices = [
|
|
102
|
-
0,
|
|
103
|
-
...this.alignedChapters.toSorted(
|
|
104
|
-
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
105
|
-
).flatMap((aligned) => [
|
|
106
|
-
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
107
|
-
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
108
|
-
]),
|
|
109
|
-
transcriptionText.length
|
|
110
|
-
];
|
|
111
|
-
const sliceIndices = [];
|
|
112
|
-
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
113
|
-
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
114
|
-
}
|
|
115
|
-
const allSlices = [];
|
|
116
|
-
let startSlice = 0;
|
|
117
|
-
for (const [i, [start, end]] of enumerate(sliceIndices)) {
|
|
118
|
-
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
119
|
-
if (!reverse) {
|
|
120
|
-
startSlice = i + 1;
|
|
121
|
-
allSlices.push({
|
|
122
|
-
start,
|
|
123
|
-
text: transcriptionText.slice(start, lastMatchOffset)
|
|
124
|
-
});
|
|
125
|
-
}
|
|
126
|
-
allSlices.push({
|
|
127
|
-
start: lastMatchOffset,
|
|
128
|
-
text: transcriptionText.slice(lastMatchOffset, end)
|
|
129
|
-
});
|
|
130
|
-
} else if (!reverse) {
|
|
131
|
-
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
const slices = allSlices.filter((slice) => slice.text.length);
|
|
135
|
-
if (reverse && !slices.length) {
|
|
136
|
-
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
137
|
-
if (indices) {
|
|
138
|
-
slices.push({
|
|
139
|
-
start: indices[0],
|
|
140
|
-
text: transcriptionText.slice(...indices)
|
|
141
|
-
});
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
145
|
-
let startSentence = 0;
|
|
146
|
-
while (startSentence < epubSentences.length) {
|
|
147
|
-
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
148
|
-
const firstMatch = findNearestMatch(
|
|
149
|
-
needle,
|
|
150
|
-
slice.text,
|
|
151
|
-
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
152
|
-
);
|
|
153
|
-
if (firstMatch) {
|
|
154
|
-
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
155
|
-
return {
|
|
156
|
-
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
157
|
-
transcriptionOffset: start
|
|
158
|
-
};
|
|
159
|
-
}
|
|
160
|
-
startSentence += 3;
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
if (reverse) {
|
|
164
|
-
return {
|
|
165
|
-
startSentence: epubSentences.length,
|
|
166
|
-
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
167
|
-
};
|
|
168
|
-
}
|
|
169
|
-
return { startSentence: 0, transcriptionOffset: null };
|
|
170
|
-
}
|
|
171
102
|
async getChapterSentences(chapterId) {
|
|
172
103
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
173
104
|
const { result: segmentation } = await getXhtmlSegmentation(
|
|
@@ -230,7 +161,7 @@ class Aligner {
|
|
|
230
161
|
value: Epub.formatSmilDuration(chapterDuration)
|
|
231
162
|
});
|
|
232
163
|
}
|
|
233
|
-
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, transcriptionOffset) {
|
|
164
|
+
addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
|
|
234
165
|
this.report.chapters.push({
|
|
235
166
|
href: chapter.href,
|
|
236
167
|
transcriptionOffset,
|
|
@@ -254,6 +185,14 @@ class Aligner {
|
|
|
254
185
|
matchedSentence: chapterSentences[startSentence],
|
|
255
186
|
nextSentence: chapterSentences[startSentence + 1] ?? null
|
|
256
187
|
},
|
|
188
|
+
lastMatchedSentenceId: endSentence,
|
|
189
|
+
lastMatchedSentenceContext: {
|
|
190
|
+
prevSentence: chapterSentences[endSentence - 1] ?? null,
|
|
191
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
192
|
+
matchedSentence: chapterSentences[endSentence],
|
|
193
|
+
nextSentence: chapterSentences[endSentence + 1] ?? null
|
|
194
|
+
},
|
|
195
|
+
chapterSentenceCount: sentenceRanges.length,
|
|
257
196
|
audioFiles: sentenceRanges.reduce((acc, range) => {
|
|
258
197
|
const existing = acc.find(
|
|
259
198
|
(context) => context.filepath === range.audiofile
|
|
@@ -271,7 +210,7 @@ class Aligner {
|
|
|
271
210
|
}, [])
|
|
272
211
|
});
|
|
273
212
|
}
|
|
274
|
-
async alignChapter(
|
|
213
|
+
async alignChapter(chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
275
214
|
const timing = createTiming();
|
|
276
215
|
timing.start("read contents");
|
|
277
216
|
const manifest = await this.epub.getManifest();
|
|
@@ -286,9 +225,12 @@ class Aligner {
|
|
|
286
225
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
287
226
|
timing.end("split to sentences");
|
|
288
227
|
timing.start("align sentences");
|
|
289
|
-
const {
|
|
290
|
-
|
|
291
|
-
|
|
228
|
+
const {
|
|
229
|
+
sentenceRanges,
|
|
230
|
+
transcriptionOffset: endTranscriptionOffset,
|
|
231
|
+
firstFoundSentence,
|
|
232
|
+
lastFoundSentence
|
|
233
|
+
} = await getSentenceRanges(
|
|
292
234
|
this.transcription,
|
|
293
235
|
chapterSentences,
|
|
294
236
|
transcriptionOffset,
|
|
@@ -316,7 +258,8 @@ class Aligner {
|
|
|
316
258
|
chapter,
|
|
317
259
|
chapterSentences,
|
|
318
260
|
sentenceRanges,
|
|
319
|
-
|
|
261
|
+
firstFoundSentence,
|
|
262
|
+
lastFoundSentence,
|
|
320
263
|
transcriptionOffset
|
|
321
264
|
);
|
|
322
265
|
return {
|
|
@@ -325,8 +268,20 @@ class Aligner {
|
|
|
325
268
|
timing
|
|
326
269
|
};
|
|
327
270
|
}
|
|
271
|
+
narrowToAvailableBoundary(boundary) {
|
|
272
|
+
const narrowed = { ...boundary };
|
|
273
|
+
for (const chapter of this.alignedChapters) {
|
|
274
|
+
if (chapter.startOffset > narrowed.start && chapter.startOffset <= narrowed.end) {
|
|
275
|
+
narrowed.end = chapter.startOffset - 1;
|
|
276
|
+
}
|
|
277
|
+
if (chapter.endOffset < narrowed.end && chapter.endOffset >= narrowed.start) {
|
|
278
|
+
narrowed.start = chapter.endOffset + 1;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
return narrowed;
|
|
282
|
+
}
|
|
328
283
|
async alignBook(onProgress) {
|
|
329
|
-
var _a, _b, _c, _d, _e, _f
|
|
284
|
+
var _a, _b, _c, _d, _e, _f;
|
|
330
285
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
331
286
|
this.timing.setMetadata("language", locale.toString());
|
|
332
287
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -336,7 +291,6 @@ class Aligner {
|
|
|
336
291
|
this.transcription.transcript,
|
|
337
292
|
locale
|
|
338
293
|
);
|
|
339
|
-
let lastTranscriptionOffset = 0;
|
|
340
294
|
for (let index = 0; index < spine.length; index++) {
|
|
341
295
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
342
296
|
const spineItem = spine[index];
|
|
@@ -365,48 +319,29 @@ class Aligner {
|
|
|
365
319
|
);
|
|
366
320
|
continue;
|
|
367
321
|
}
|
|
368
|
-
const
|
|
369
|
-
slugifiedChapterSentences,
|
|
370
|
-
transcriptionText
|
|
371
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
322
|
+
const boundaries = findBoundaries(
|
|
323
|
+
slugifiedChapterSentences.join("-"),
|
|
324
|
+
transcriptionText
|
|
372
325
|
);
|
|
373
|
-
if (
|
|
326
|
+
if (!boundaries) {
|
|
374
327
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
375
|
-
`
|
|
328
|
+
`Could not find chapter #${index} in the transcripton`
|
|
376
329
|
);
|
|
377
330
|
continue;
|
|
378
331
|
}
|
|
379
|
-
const
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
transcriptionOffset: slugifiedEndOffset
|
|
383
|
-
} = this.findBestOffset(
|
|
384
|
-
slugifiedChapterSentences,
|
|
385
|
-
transcriptionText,
|
|
386
|
-
Math.min(
|
|
387
|
-
transcriptionText.length,
|
|
388
|
-
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
389
|
-
),
|
|
390
|
-
-1
|
|
391
|
-
);
|
|
392
|
-
const endSentence = startEndSentence;
|
|
393
|
-
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
394
|
-
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
395
|
-
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
332
|
+
const { start, end } = this.narrowToAvailableBoundary(boundaries);
|
|
333
|
+
if (start === end) {
|
|
334
|
+
continue;
|
|
396
335
|
}
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
);
|
|
336
|
+
const transcriptionOffset = mapping.invert().map(Math.max(start, 0), -1);
|
|
337
|
+
const endOffset = mapping.invert().map(Math.min(end, transcriptionText.length), 1);
|
|
400
338
|
const result = await this.alignChapter(
|
|
401
|
-
startSentence,
|
|
402
|
-
endSentence,
|
|
403
339
|
chapterId,
|
|
404
340
|
transcriptionOffset,
|
|
405
341
|
endOffset,
|
|
406
342
|
locale,
|
|
407
343
|
mapping
|
|
408
344
|
);
|
|
409
|
-
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
410
345
|
this.timing.add(result.timing.summary());
|
|
411
346
|
}
|
|
412
347
|
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|