@storyteller-platform/align 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +6 -5
- package/dist/align/__tests__/align.test.js +6 -5
- package/dist/align/align.cjs +133 -81
- package/dist/align/align.d.cts +1 -0
- package/dist/align/align.d.ts +1 -0
- package/dist/align/align.js +133 -81
- package/dist/align/getSentenceRanges.cjs +78 -149
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +78 -149
- package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
- package/dist/errorAlign/__tests__/native.test.cjs +118 -0
- package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/native.test.js +107 -0
- package/dist/errorAlign/backtraceGraph.cjs +298 -0
- package/dist/errorAlign/backtraceGraph.d.cts +103 -0
- package/dist/errorAlign/backtraceGraph.d.ts +103 -0
- package/dist/errorAlign/backtraceGraph.js +270 -0
- package/dist/errorAlign/beamSearch.cjs +302 -0
- package/dist/errorAlign/beamSearch.d.cts +53 -0
- package/dist/errorAlign/beamSearch.d.ts +53 -0
- package/dist/errorAlign/beamSearch.js +268 -0
- package/dist/errorAlign/core.cjs +33 -0
- package/dist/errorAlign/core.d.cts +5 -0
- package/dist/errorAlign/core.d.ts +5 -0
- package/dist/errorAlign/core.js +11 -0
- package/dist/errorAlign/editDistance.cjs +115 -0
- package/dist/errorAlign/editDistance.d.cts +46 -0
- package/dist/errorAlign/editDistance.d.ts +46 -0
- package/dist/errorAlign/editDistance.js +90 -0
- package/dist/errorAlign/errorAlign.cjs +159 -0
- package/dist/errorAlign/errorAlign.d.cts +15 -0
- package/dist/errorAlign/errorAlign.d.ts +15 -0
- package/dist/errorAlign/errorAlign.js +145 -0
- package/dist/errorAlign/graphMetadata.cjs +97 -0
- package/dist/errorAlign/graphMetadata.d.cts +44 -0
- package/dist/errorAlign/graphMetadata.d.ts +44 -0
- package/dist/errorAlign/graphMetadata.js +64 -0
- package/dist/errorAlign/hash.cjs +173 -0
- package/dist/errorAlign/hash.d.cts +28 -0
- package/dist/errorAlign/hash.d.ts +28 -0
- package/dist/errorAlign/hash.js +150 -0
- package/dist/errorAlign/native.cjs +60 -0
- package/dist/errorAlign/native.d.cts +18 -0
- package/dist/errorAlign/native.d.ts +18 -0
- package/dist/errorAlign/native.js +24 -0
- package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
- package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
- package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
- package/dist/errorAlign/node-gyp-build.d.js +0 -0
- package/dist/errorAlign/pathToAlignment.cjs +122 -0
- package/dist/errorAlign/pathToAlignment.d.cts +11 -0
- package/dist/errorAlign/pathToAlignment.d.ts +11 -0
- package/dist/errorAlign/pathToAlignment.js +89 -0
- package/dist/errorAlign/utils.cjs +301 -0
- package/dist/errorAlign/utils.d.cts +107 -0
- package/dist/errorAlign/utils.d.ts +107 -0
- package/dist/errorAlign/utils.js +248 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/markup/__tests__/markup.test.cjs +108 -81
- package/dist/markup/__tests__/markup.test.js +109 -82
- package/dist/markup/__tests__/parseDom.test.cjs +112 -0
- package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
- package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
- package/dist/markup/__tests__/parseDom.test.js +89 -0
- package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
- package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
- package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
- package/dist/markup/__tests__/serializeDom.test.js +97 -0
- package/dist/markup/__tests__/transform.test.cjs +122 -0
- package/dist/markup/__tests__/transform.test.d.cts +2 -0
- package/dist/markup/__tests__/transform.test.d.ts +2 -0
- package/dist/markup/__tests__/transform.test.js +99 -0
- package/dist/markup/map.cjs +261 -0
- package/dist/markup/map.d.cts +50 -0
- package/dist/markup/map.d.ts +50 -0
- package/dist/markup/map.js +236 -0
- package/dist/markup/markup.cjs +23 -201
- package/dist/markup/markup.d.cts +5 -9
- package/dist/markup/markup.d.ts +5 -9
- package/dist/markup/markup.js +24 -203
- package/dist/markup/model.cjs +172 -0
- package/dist/markup/model.d.cts +57 -0
- package/dist/markup/model.d.ts +57 -0
- package/dist/markup/model.js +145 -0
- package/dist/markup/parseDom.cjs +59 -0
- package/dist/markup/parseDom.d.cts +7 -0
- package/dist/markup/parseDom.d.ts +7 -0
- package/dist/markup/parseDom.js +35 -0
- package/dist/markup/segmentation.cjs +11 -57
- package/dist/markup/segmentation.d.cts +6 -2
- package/dist/markup/segmentation.d.ts +6 -2
- package/dist/markup/segmentation.js +11 -58
- package/dist/markup/serializeDom.cjs +87 -0
- package/dist/markup/serializeDom.d.cts +7 -0
- package/dist/markup/serializeDom.d.ts +7 -0
- package/dist/markup/serializeDom.js +63 -0
- package/dist/markup/transform.cjs +92 -0
- package/dist/markup/transform.d.cts +11 -0
- package/dist/markup/transform.d.ts +11 -0
- package/dist/markup/transform.js +71 -0
- package/dist/types/node-gyp-build.d.cjs +1 -0
- package/dist/types/node-gyp-build.d.d.cts +3 -0
- package/dist/types/node-gyp-build.d.d.ts +3 -0
- package/dist/types/node-gyp-build.d.js +0 -0
- package/package.json +11 -4
package/dist/align/align.js
CHANGED
|
@@ -5,7 +5,9 @@ import {
|
|
|
5
5
|
import { copyFile, mkdir, readFile, readdir, writeFile } from "node:fs/promises";
|
|
6
6
|
import { dirname as autoDirname, join as autoJoin } from "node:path";
|
|
7
7
|
import { basename, dirname, parse, relative } from "node:path/posix";
|
|
8
|
+
import { enumerate } from "itertools";
|
|
8
9
|
import memoize from "memoize";
|
|
10
|
+
import { runes } from "runes2";
|
|
9
11
|
import { isAudioFile, lookupAudioMime } from "@storyteller-platform/audiobook";
|
|
10
12
|
import {
|
|
11
13
|
Epub
|
|
@@ -24,7 +26,6 @@ import {
|
|
|
24
26
|
interpolateSentenceRanges
|
|
25
27
|
} from "./getSentenceRanges.js";
|
|
26
28
|
import { slugify } from "./slugify.js";
|
|
27
|
-
const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
|
|
28
29
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
29
30
|
var _stack = [];
|
|
30
31
|
try {
|
|
@@ -75,6 +76,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
75
76
|
class Aligner {
|
|
76
77
|
constructor(epub, audiofiles, transcriptions, granularity, languageOverride, logger) {
|
|
77
78
|
this.epub = epub;
|
|
79
|
+
this.audiofiles = audiofiles;
|
|
78
80
|
this.languageOverride = languageOverride;
|
|
79
81
|
this.logger = logger;
|
|
80
82
|
this.transcription = concatTranscriptions(transcriptions, audiofiles);
|
|
@@ -89,71 +91,92 @@ class Aligner {
|
|
|
89
91
|
report = {
|
|
90
92
|
chapters: []
|
|
91
93
|
};
|
|
92
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset,
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
94
|
+
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
95
|
+
const reverse = dir < 0;
|
|
96
|
+
if (dir < 0) {
|
|
97
|
+
epubSentences = epubSentences.toReversed().map((s) => runes(s).toReversed().join(""));
|
|
98
|
+
transcriptionText = runes(transcriptionText).toReversed().join("");
|
|
99
|
+
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
100
|
+
}
|
|
101
|
+
const flatSliceIndices = [
|
|
102
|
+
0,
|
|
103
|
+
...this.alignedChapters.toSorted(
|
|
104
|
+
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
105
|
+
).flatMap((aligned) => [
|
|
106
|
+
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
107
|
+
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
108
|
+
]),
|
|
109
|
+
transcriptionText.length
|
|
110
|
+
];
|
|
111
|
+
const sliceIndices = [];
|
|
112
|
+
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
113
|
+
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
114
|
+
}
|
|
115
|
+
const allSlices = [];
|
|
116
|
+
let startSlice = 0;
|
|
117
|
+
for (const [i, [start, end]] of enumerate(sliceIndices)) {
|
|
118
|
+
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
119
|
+
if (!reverse) {
|
|
120
|
+
startSlice = i + 1;
|
|
121
|
+
allSlices.push({
|
|
122
|
+
start,
|
|
123
|
+
text: transcriptionText.slice(start, lastMatchOffset)
|
|
124
|
+
});
|
|
117
125
|
}
|
|
126
|
+
allSlices.push({
|
|
127
|
+
start: lastMatchOffset,
|
|
128
|
+
text: transcriptionText.slice(lastMatchOffset, end)
|
|
129
|
+
});
|
|
130
|
+
} else if (!reverse) {
|
|
131
|
+
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
const slices = allSlices.filter((slice) => slice.text.length);
|
|
135
|
+
if (reverse && !slices.length) {
|
|
136
|
+
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
137
|
+
if (indices) {
|
|
138
|
+
slices.push({
|
|
139
|
+
start: indices[0],
|
|
140
|
+
text: transcriptionText.slice(...indices)
|
|
141
|
+
});
|
|
118
142
|
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
143
|
+
}
|
|
144
|
+
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
145
|
+
let startSentence = 0;
|
|
146
|
+
while (startSentence < epubSentences.length) {
|
|
147
|
+
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
148
|
+
const firstMatch = findNearestMatch(
|
|
149
|
+
needle,
|
|
150
|
+
slice.text,
|
|
151
|
+
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
123
152
|
);
|
|
124
|
-
|
|
125
|
-
const
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
);
|
|
131
|
-
if (firstMatch) {
|
|
132
|
-
return {
|
|
133
|
-
startSentence,
|
|
134
|
-
transcriptionOffset: (firstMatch.index + startIndex) % transcriptionText.length
|
|
135
|
-
};
|
|
136
|
-
}
|
|
137
|
-
startSentence += 3;
|
|
153
|
+
if (firstMatch) {
|
|
154
|
+
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
155
|
+
return {
|
|
156
|
+
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
157
|
+
transcriptionOffset: start
|
|
158
|
+
};
|
|
138
159
|
}
|
|
160
|
+
startSentence += 3;
|
|
139
161
|
}
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
162
|
+
}
|
|
163
|
+
if (reverse) {
|
|
164
|
+
return {
|
|
165
|
+
startSentence: epubSentences.length,
|
|
166
|
+
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
167
|
+
};
|
|
145
168
|
}
|
|
146
169
|
return { startSentence: 0, transcriptionOffset: null };
|
|
147
170
|
}
|
|
148
171
|
async getChapterSentences(chapterId) {
|
|
149
172
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
150
|
-
const segmentation = await getXhtmlSegmentation(
|
|
173
|
+
const { result: segmentation } = await getXhtmlSegmentation(
|
|
151
174
|
Epub.getXhtmlBody(chapterXml),
|
|
152
175
|
{
|
|
153
176
|
primaryLocale: this.languageOverride ?? await this.epub.getLanguage()
|
|
154
177
|
}
|
|
155
178
|
);
|
|
156
|
-
return segmentation.
|
|
179
|
+
return segmentation.map((s) => s.text).filter((s) => s.match(/\S/));
|
|
157
180
|
}
|
|
158
181
|
async writeAlignedChapter(alignedChapter) {
|
|
159
182
|
const { chapter, sentenceRanges, xml } = alignedChapter;
|
|
@@ -248,7 +271,7 @@ class Aligner {
|
|
|
248
271
|
}, [])
|
|
249
272
|
});
|
|
250
273
|
}
|
|
251
|
-
async alignChapter(startSentence, chapterId, transcriptionOffset, locale,
|
|
274
|
+
async alignChapter(startSentence, endSentence, chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
252
275
|
const timing = createTiming();
|
|
253
276
|
timing.start("read contents");
|
|
254
277
|
const manifest = await this.epub.getManifest();
|
|
@@ -265,20 +288,14 @@ class Aligner {
|
|
|
265
288
|
timing.start("align sentences");
|
|
266
289
|
const { sentenceRanges, transcriptionOffset: endTranscriptionOffset } = await getSentenceRanges(
|
|
267
290
|
startSentence,
|
|
291
|
+
endSentence,
|
|
268
292
|
this.transcription,
|
|
269
293
|
chapterSentences,
|
|
270
294
|
transcriptionOffset,
|
|
271
|
-
|
|
272
|
-
|
|
295
|
+
transcriptionEndOffset,
|
|
296
|
+
locale
|
|
273
297
|
);
|
|
274
298
|
timing.end("align sentences");
|
|
275
|
-
timing.start("expand ranges");
|
|
276
|
-
const interpolated = await interpolateSentenceRanges(
|
|
277
|
-
sentenceRanges,
|
|
278
|
-
lastSentenceRange
|
|
279
|
-
);
|
|
280
|
-
const expanded = expandEmptySentenceRanges(interpolated);
|
|
281
|
-
timing.end("expand ranges");
|
|
282
299
|
const storytellerStylesheetUrl = relative(
|
|
283
300
|
dirname(chapter.href),
|
|
284
301
|
"Styles/storyteller-readaloud.css"
|
|
@@ -291,25 +308,25 @@ class Aligner {
|
|
|
291
308
|
this.alignedChapters.push({
|
|
292
309
|
chapter,
|
|
293
310
|
xml: chapterXml,
|
|
294
|
-
sentenceRanges
|
|
295
|
-
startOffset: transcriptionOffset,
|
|
296
|
-
endOffset: endTranscriptionOffset
|
|
311
|
+
sentenceRanges,
|
|
312
|
+
startOffset: mapping.map(transcriptionOffset),
|
|
313
|
+
endOffset: mapping.map(endTranscriptionOffset, -1)
|
|
297
314
|
});
|
|
298
315
|
this.addChapterReport(
|
|
299
316
|
chapter,
|
|
300
317
|
chapterSentences,
|
|
301
|
-
|
|
318
|
+
sentenceRanges,
|
|
302
319
|
startSentence,
|
|
303
320
|
transcriptionOffset
|
|
304
321
|
);
|
|
305
322
|
return {
|
|
306
|
-
lastSentenceRange:
|
|
323
|
+
lastSentenceRange: sentenceRanges.at(-1) ?? null,
|
|
307
324
|
endTranscriptionOffset,
|
|
308
325
|
timing
|
|
309
326
|
};
|
|
310
327
|
}
|
|
311
328
|
async alignBook(onProgress) {
|
|
312
|
-
var _a, _b, _c, _d, _e, _f, _g;
|
|
329
|
+
var _a, _b, _c, _d, _e, _f, _g, _h;
|
|
313
330
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
314
331
|
this.timing.setMetadata("language", locale.toString());
|
|
315
332
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -320,7 +337,6 @@ class Aligner {
|
|
|
320
337
|
locale
|
|
321
338
|
);
|
|
322
339
|
let lastTranscriptionOffset = 0;
|
|
323
|
-
let lastSentenceRange = null;
|
|
324
340
|
for (let index = 0; index < spine.length; index++) {
|
|
325
341
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
326
342
|
const spineItem = spine[index];
|
|
@@ -352,36 +368,72 @@ class Aligner {
|
|
|
352
368
|
const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
|
|
353
369
|
slugifiedChapterSentences,
|
|
354
370
|
transcriptionText,
|
|
355
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
356
|
-
mapping
|
|
371
|
+
mapping.map(lastTranscriptionOffset, -1)
|
|
357
372
|
);
|
|
358
|
-
|
|
359
|
-
if (transcriptionOffset === null) {
|
|
373
|
+
if (slugifiedOffset === null) {
|
|
360
374
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
361
375
|
`Couldn't find matching transcription for chapter #${index}`
|
|
362
376
|
);
|
|
363
377
|
continue;
|
|
364
378
|
}
|
|
365
|
-
|
|
366
|
-
|
|
379
|
+
const transcriptionOffset = mapping.invert().map(slugifiedOffset, -1);
|
|
380
|
+
const {
|
|
381
|
+
startSentence: startEndSentence,
|
|
382
|
+
transcriptionOffset: slugifiedEndOffset
|
|
383
|
+
} = this.findBestOffset(
|
|
384
|
+
slugifiedChapterSentences,
|
|
385
|
+
transcriptionText,
|
|
386
|
+
Math.min(
|
|
387
|
+
transcriptionText.length,
|
|
388
|
+
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
389
|
+
),
|
|
390
|
+
-1
|
|
391
|
+
);
|
|
392
|
+
const endSentence = startEndSentence;
|
|
393
|
+
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
394
|
+
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
395
|
+
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
396
|
+
}
|
|
397
|
+
(_h = this.logger) == null ? void 0 : _h.info(
|
|
398
|
+
`Chapter #${index} best matches transcription from ${transcriptionOffset} to ${endOffset}, from sentence ${startSentence} to ${endSentence} (of ${slugifiedChapterSentences.length}) in the book`
|
|
367
399
|
);
|
|
368
400
|
const result = await this.alignChapter(
|
|
369
401
|
startSentence,
|
|
402
|
+
endSentence,
|
|
370
403
|
chapterId,
|
|
371
404
|
transcriptionOffset,
|
|
405
|
+
endOffset,
|
|
372
406
|
locale,
|
|
373
|
-
|
|
407
|
+
mapping
|
|
374
408
|
);
|
|
375
|
-
lastSentenceRange = result.lastSentenceRange;
|
|
376
409
|
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
377
410
|
this.timing.add(result.timing.summary());
|
|
378
411
|
}
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
412
|
+
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|
|
413
|
+
const firstRangeA = a.sentenceRanges[0];
|
|
414
|
+
const firstRangeB = b.sentenceRanges[0];
|
|
415
|
+
if (!firstRangeA) return 1;
|
|
416
|
+
if (!firstRangeB) return -1;
|
|
417
|
+
const firstAudiofileIndexA = this.audiofiles.indexOf(
|
|
418
|
+
firstRangeA.audiofile
|
|
382
419
|
);
|
|
383
|
-
|
|
384
|
-
|
|
420
|
+
const firstAudiofileIndexB = this.audiofiles.indexOf(
|
|
421
|
+
firstRangeB.audiofile
|
|
422
|
+
);
|
|
423
|
+
if (firstAudiofileIndexA === firstAudiofileIndexB) {
|
|
424
|
+
return firstRangeA.start - firstRangeB.start;
|
|
425
|
+
}
|
|
426
|
+
return firstAudiofileIndexA - firstAudiofileIndexB;
|
|
427
|
+
});
|
|
428
|
+
let lastSentenceRange = null;
|
|
429
|
+
for (const alignedChapter of audioOrderedChapters) {
|
|
430
|
+
const interpolated = await interpolateSentenceRanges(
|
|
431
|
+
alignedChapter.sentenceRanges,
|
|
432
|
+
lastSentenceRange
|
|
433
|
+
);
|
|
434
|
+
const expanded = expandEmptySentenceRanges(interpolated);
|
|
435
|
+
alignedChapter.sentenceRanges = expanded;
|
|
436
|
+
lastSentenceRange = expanded.at(-1) ?? null;
|
|
385
437
|
await this.writeAlignedChapter(alignedChapter);
|
|
386
438
|
}
|
|
387
439
|
await this.epub.addMetadata({
|
|
@@ -25,29 +25,10 @@ __export(getSentenceRanges_exports, {
|
|
|
25
25
|
interpolateSentenceRanges: () => interpolateSentenceRanges
|
|
26
26
|
});
|
|
27
27
|
module.exports = __toCommonJS(getSentenceRanges_exports);
|
|
28
|
-
var
|
|
28
|
+
var import_itertools = require("itertools");
|
|
29
29
|
var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
30
|
-
var
|
|
30
|
+
var import_errorAlign = require("../errorAlign/errorAlign.cjs");
|
|
31
31
|
var import_slugify = require("./slugify.cjs");
|
|
32
|
-
async function getSentencesWithOffsets(text) {
|
|
33
|
-
const sentences = await (0, import_text_segmentation.segmentText)(text).then(
|
|
34
|
-
(r) => r.sentences.map((s) => s.text)
|
|
35
|
-
);
|
|
36
|
-
const sentencesWithOffsets = [];
|
|
37
|
-
let lastSentenceEnd = 0;
|
|
38
|
-
for (const sentence of sentences) {
|
|
39
|
-
const sentenceStart = text.indexOf(sentence, lastSentenceEnd);
|
|
40
|
-
if (sentenceStart > lastSentenceEnd) {
|
|
41
|
-
sentencesWithOffsets.push(text.slice(lastSentenceEnd, sentenceStart));
|
|
42
|
-
}
|
|
43
|
-
sentencesWithOffsets.push(sentence);
|
|
44
|
-
lastSentenceEnd = sentenceStart + sentence.length;
|
|
45
|
-
}
|
|
46
|
-
if (text.length > lastSentenceEnd) {
|
|
47
|
-
sentencesWithOffsets.push(text.slice(lastSentenceEnd));
|
|
48
|
-
}
|
|
49
|
-
return sentencesWithOffsets;
|
|
50
|
-
}
|
|
51
32
|
function findStartTimestamp(matchStartIndex, transcription) {
|
|
52
33
|
const entry = transcription.timeline.find(
|
|
53
34
|
(entry2) => (entry2.endOffsetUtf16 ?? 0) > matchStartIndex
|
|
@@ -65,144 +46,92 @@ function findEndTimestamp(matchEndIndex, transcription) {
|
|
|
65
46
|
);
|
|
66
47
|
return (entry == null ? void 0 : entry.endTime) ?? null;
|
|
67
48
|
}
|
|
68
|
-
function
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
49
|
+
function getAlignmentsForSentence(sentence, alignments) {
|
|
50
|
+
const result = [];
|
|
51
|
+
let sentenceIndex = 0;
|
|
52
|
+
for (const alignment of alignments) {
|
|
53
|
+
if (sentenceIndex === sentence.length) break;
|
|
54
|
+
if (alignment.opType !== "INSERT") {
|
|
55
|
+
sentenceIndex += alignment.ref.length + (sentenceIndex === 0 ? 0 : 1);
|
|
56
|
+
}
|
|
57
|
+
result.push(alignment);
|
|
73
58
|
}
|
|
74
|
-
return
|
|
59
|
+
return result;
|
|
75
60
|
}
|
|
76
|
-
function
|
|
77
|
-
return input.replaceAll(/\s+/g, " ");
|
|
78
|
-
}
|
|
79
|
-
async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
|
|
61
|
+
async function getSentenceRanges(startSentence, endSentence, transcription, sentences, chapterOffset, chapterEndOffset, locale) {
|
|
80
62
|
const sentenceRanges = [];
|
|
81
|
-
const
|
|
82
|
-
const
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
let notFound = 0;
|
|
100
|
-
let sentenceIndex = startSentenceEntry;
|
|
101
|
-
let lastMatchEnd = chapterOffset;
|
|
102
|
-
while (sentenceIndex < sentenceEntries.length) {
|
|
103
|
-
const [sentenceId, sentence] = sentenceEntries[sentenceIndex];
|
|
104
|
-
const transcriptionWindowList = transcriptionSentences.slice(
|
|
105
|
-
transcriptionWindowIndex,
|
|
106
|
-
transcriptionWindowIndex + 10
|
|
107
|
-
);
|
|
108
|
-
const { result: transcriptionWindow, mapping } = await (0, import_slugify.slugify)(
|
|
109
|
-
transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
|
|
110
|
-
locale
|
|
111
|
-
);
|
|
112
|
-
const inverted = mapping.invert();
|
|
113
|
-
const query = collapseWhitespace(sentence.trim()).toLowerCase();
|
|
114
|
-
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
115
|
-
query,
|
|
116
|
-
transcriptionWindow,
|
|
117
|
-
Math.max(Math.floor(0.25 * query.length), 1)
|
|
118
|
-
);
|
|
119
|
-
if (!firstMatch) {
|
|
120
|
-
sentenceIndex += 1;
|
|
121
|
-
notFound += 1;
|
|
122
|
-
if (notFound === 3 || sentenceIndex === sentenceEntries.length) {
|
|
123
|
-
transcriptionWindowIndex += 1;
|
|
124
|
-
if (transcriptionWindowIndex == lastGoodTranscriptionWindow + 30) {
|
|
125
|
-
transcriptionWindowIndex = lastGoodTranscriptionWindow;
|
|
126
|
-
notFound = 0;
|
|
127
|
-
continue;
|
|
128
|
-
}
|
|
129
|
-
sentenceIndex -= notFound;
|
|
130
|
-
notFound = 0;
|
|
131
|
-
}
|
|
132
|
-
continue;
|
|
63
|
+
const fullTranscript = transcription.transcript;
|
|
64
|
+
const chapterTranscript = fullTranscript.slice(
|
|
65
|
+
chapterOffset,
|
|
66
|
+
chapterEndOffset
|
|
67
|
+
);
|
|
68
|
+
const { result: slugifiedChapterTranscript, mapping: transcriptMapping } = await (0, import_slugify.slugify)(chapterTranscript, locale);
|
|
69
|
+
let chapterTranscriptEndIndex = chapterOffset;
|
|
70
|
+
let chapterSentenceIndex = startSentence;
|
|
71
|
+
let slugifiedChapterTranscriptWindowStartIndex = 0;
|
|
72
|
+
while (chapterSentenceIndex < endSentence) {
|
|
73
|
+
const slugifiedChapterSentenceWindowList = [];
|
|
74
|
+
let sentenceWindowLength = 0;
|
|
75
|
+
let i = chapterSentenceIndex;
|
|
76
|
+
while (sentenceWindowLength < 5e3 && i < sentences.length) {
|
|
77
|
+
const { result: sentence } = await (0, import_slugify.slugify)(sentences[i], locale);
|
|
78
|
+
slugifiedChapterSentenceWindowList.push(sentence);
|
|
79
|
+
sentenceWindowLength += sentence.length;
|
|
80
|
+
i++;
|
|
133
81
|
}
|
|
134
|
-
const
|
|
135
|
-
const
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
-1
|
|
82
|
+
const slugifiedChapterSentenceWindow = slugifiedChapterSentenceWindowList.join("-");
|
|
83
|
+
const slugifiedChapterTranscriptWindow = slugifiedChapterTranscript.slice(
|
|
84
|
+
slugifiedChapterTranscriptWindowStartIndex,
|
|
85
|
+
slugifiedChapterTranscriptWindowStartIndex + sentenceWindowLength * 1.2
|
|
139
86
|
);
|
|
140
|
-
const
|
|
141
|
-
|
|
142
|
-
|
|
87
|
+
const alignments = (0, import_errorAlign.errorAlign)(
|
|
88
|
+
slugifiedChapterSentenceWindow,
|
|
89
|
+
slugifiedChapterTranscriptWindow
|
|
143
90
|
);
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
91
|
+
let alignmentIndex = 0;
|
|
92
|
+
let currentTranscriptWindowIndex = 0;
|
|
93
|
+
for (const [i2, slugifiedSentence] of (0, import_itertools.enumerate)(
|
|
94
|
+
slugifiedChapterSentenceWindowList
|
|
95
|
+
)) {
|
|
96
|
+
if (!slugifiedSentence) continue;
|
|
97
|
+
const sentenceAlignments = getAlignmentsForSentence(
|
|
98
|
+
slugifiedSentence,
|
|
99
|
+
alignments.slice(alignmentIndex)
|
|
100
|
+
);
|
|
101
|
+
const sentenceLengthInSlugifiedTranscript = sentenceAlignments.filter((a) => a.opType !== "DELETE").map((a) => a.hyp).join("-").length;
|
|
102
|
+
const start = findStartTimestamp(
|
|
103
|
+
chapterOffset + transcriptMapping.invert().map(
|
|
104
|
+
slugifiedChapterTranscriptWindowStartIndex + currentTranscriptWindowIndex,
|
|
105
|
+
1
|
|
106
|
+
),
|
|
107
|
+
transcription
|
|
108
|
+
);
|
|
109
|
+
chapterTranscriptEndIndex = chapterOffset + transcriptMapping.invert().map(
|
|
110
|
+
slugifiedChapterTranscriptWindowStartIndex + currentTranscriptWindowIndex + sentenceLengthInSlugifiedTranscript,
|
|
111
|
+
-1
|
|
112
|
+
);
|
|
113
|
+
const end = findEndTimestamp(chapterTranscriptEndIndex, transcription);
|
|
114
|
+
if (start && end !== null) {
|
|
115
|
+
sentenceRanges.push({
|
|
116
|
+
id: i2 + chapterSentenceIndex,
|
|
117
|
+
start: start.start,
|
|
118
|
+
audiofile: start.audiofile,
|
|
119
|
+
end
|
|
120
|
+
});
|
|
167
121
|
}
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
}
|
|
173
|
-
} else {
|
|
174
|
-
const lastTrackDuration = await (0, import_ffmpeg.getTrackDuration)(
|
|
175
|
-
lastSentenceRange.audiofile
|
|
176
|
-
);
|
|
177
|
-
lastSentenceRange.end = lastTrackDuration;
|
|
178
|
-
if (sentenceId === 0) {
|
|
179
|
-
start = 0;
|
|
180
|
-
}
|
|
122
|
+
alignmentIndex += sentenceAlignments.length;
|
|
123
|
+
currentTranscriptWindowIndex += sentenceLengthInSlugifiedTranscript;
|
|
124
|
+
if (slugifiedChapterTranscriptWindow[currentTranscriptWindowIndex] === "-") {
|
|
125
|
+
currentTranscriptWindowIndex++;
|
|
181
126
|
}
|
|
182
|
-
} else if (sentenceId === 0) {
|
|
183
|
-
start = 0;
|
|
184
127
|
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
});
|
|
191
|
-
notFound = 0;
|
|
192
|
-
lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
|
|
193
|
-
const windowIndexResult = getWindowIndexFromOffset(
|
|
194
|
-
transcriptionWindowList,
|
|
195
|
-
matchEnd + transcriptionWindowOffset
|
|
196
|
-
);
|
|
197
|
-
transcriptionWindowIndex += windowIndexResult.index;
|
|
198
|
-
transcriptionWindowOffset = windowIndexResult.offset;
|
|
199
|
-
lastGoodTranscriptionWindow = transcriptionWindowIndex;
|
|
200
|
-
sentenceIndex += 1;
|
|
128
|
+
chapterSentenceIndex += slugifiedChapterSentenceWindowList.length;
|
|
129
|
+
slugifiedChapterTranscriptWindowStartIndex += currentTranscriptWindowIndex;
|
|
130
|
+
if (slugifiedChapterTranscript[slugifiedChapterTranscriptWindowStartIndex] === "-") {
|
|
131
|
+
slugifiedChapterTranscriptWindowStartIndex++;
|
|
132
|
+
}
|
|
201
133
|
}
|
|
202
|
-
return {
|
|
203
|
-
sentenceRanges,
|
|
204
|
-
transcriptionOffset: lastMatchEnd
|
|
205
|
-
};
|
|
134
|
+
return { sentenceRanges, transcriptionOffset: chapterTranscriptEndIndex };
|
|
206
135
|
}
|
|
207
136
|
async function getLargestGap(trailing, leading) {
|
|
208
137
|
const leadingGap = leading.start;
|
|
@@ -14,7 +14,7 @@ type SentenceRange = {
|
|
|
14
14
|
audiofile: string;
|
|
15
15
|
};
|
|
16
16
|
declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
|
|
17
|
-
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale
|
|
17
|
+
declare function getSentenceRanges(startSentence: number, endSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, chapterEndOffset: number, locale: Intl.Locale): Promise<{
|
|
18
18
|
sentenceRanges: SentenceRange[];
|
|
19
19
|
transcriptionOffset: number;
|
|
20
20
|
}>;
|
|
@@ -14,7 +14,7 @@ type SentenceRange = {
|
|
|
14
14
|
audiofile: string;
|
|
15
15
|
};
|
|
16
16
|
declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
|
|
17
|
-
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale
|
|
17
|
+
declare function getSentenceRanges(startSentence: number, endSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, chapterEndOffset: number, locale: Intl.Locale): Promise<{
|
|
18
18
|
sentenceRanges: SentenceRange[];
|
|
19
19
|
transcriptionOffset: number;
|
|
20
20
|
}>;
|