@storyteller-platform/align 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +6 -5
- package/dist/align/__tests__/align.test.js +6 -5
- package/dist/align/align.cjs +133 -81
- package/dist/align/align.d.cts +1 -0
- package/dist/align/align.d.ts +1 -0
- package/dist/align/align.js +133 -81
- package/dist/align/getSentenceRanges.cjs +78 -149
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +78 -149
- package/dist/align/slugify.cjs +16 -8
- package/dist/align/slugify.js +16 -8
- package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
- package/dist/errorAlign/__tests__/native.test.cjs +118 -0
- package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/native.test.js +107 -0
- package/dist/errorAlign/backtraceGraph.cjs +298 -0
- package/dist/errorAlign/backtraceGraph.d.cts +103 -0
- package/dist/errorAlign/backtraceGraph.d.ts +103 -0
- package/dist/errorAlign/backtraceGraph.js +270 -0
- package/dist/errorAlign/beamSearch.cjs +302 -0
- package/dist/errorAlign/beamSearch.d.cts +53 -0
- package/dist/errorAlign/beamSearch.d.ts +53 -0
- package/dist/errorAlign/beamSearch.js +268 -0
- package/dist/errorAlign/core.cjs +33 -0
- package/dist/errorAlign/core.d.cts +5 -0
- package/dist/errorAlign/core.d.ts +5 -0
- package/dist/errorAlign/core.js +11 -0
- package/dist/errorAlign/editDistance.cjs +115 -0
- package/dist/errorAlign/editDistance.d.cts +46 -0
- package/dist/errorAlign/editDistance.d.ts +46 -0
- package/dist/errorAlign/editDistance.js +90 -0
- package/dist/errorAlign/errorAlign.cjs +159 -0
- package/dist/errorAlign/errorAlign.d.cts +15 -0
- package/dist/errorAlign/errorAlign.d.ts +15 -0
- package/dist/errorAlign/errorAlign.js +145 -0
- package/dist/errorAlign/graphMetadata.cjs +97 -0
- package/dist/errorAlign/graphMetadata.d.cts +44 -0
- package/dist/errorAlign/graphMetadata.d.ts +44 -0
- package/dist/errorAlign/graphMetadata.js +64 -0
- package/dist/errorAlign/hash.cjs +173 -0
- package/dist/errorAlign/hash.d.cts +28 -0
- package/dist/errorAlign/hash.d.ts +28 -0
- package/dist/errorAlign/hash.js +150 -0
- package/dist/errorAlign/native.cjs +60 -0
- package/dist/errorAlign/native.d.cts +18 -0
- package/dist/errorAlign/native.d.ts +18 -0
- package/dist/errorAlign/native.js +24 -0
- package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
- package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
- package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
- package/dist/errorAlign/node-gyp-build.d.js +0 -0
- package/dist/errorAlign/pathToAlignment.cjs +122 -0
- package/dist/errorAlign/pathToAlignment.d.cts +11 -0
- package/dist/errorAlign/pathToAlignment.d.ts +11 -0
- package/dist/errorAlign/pathToAlignment.js +89 -0
- package/dist/errorAlign/utils.cjs +301 -0
- package/dist/errorAlign/utils.d.cts +107 -0
- package/dist/errorAlign/utils.d.ts +107 -0
- package/dist/errorAlign/utils.js +248 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/markup/__tests__/markup.test.cjs +108 -81
- package/dist/markup/__tests__/markup.test.js +109 -82
- package/dist/markup/__tests__/parseDom.test.cjs +112 -0
- package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
- package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
- package/dist/markup/__tests__/parseDom.test.js +89 -0
- package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
- package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
- package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
- package/dist/markup/__tests__/serializeDom.test.js +97 -0
- package/dist/markup/__tests__/transform.test.cjs +122 -0
- package/dist/markup/__tests__/transform.test.d.cts +2 -0
- package/dist/markup/__tests__/transform.test.d.ts +2 -0
- package/dist/markup/__tests__/transform.test.js +99 -0
- package/dist/markup/map.cjs +261 -0
- package/dist/markup/map.d.cts +50 -0
- package/dist/markup/map.d.ts +50 -0
- package/dist/markup/map.js +236 -0
- package/dist/markup/markup.cjs +23 -201
- package/dist/markup/markup.d.cts +5 -9
- package/dist/markup/markup.d.ts +5 -9
- package/dist/markup/markup.js +24 -203
- package/dist/markup/model.cjs +172 -0
- package/dist/markup/model.d.cts +57 -0
- package/dist/markup/model.d.ts +57 -0
- package/dist/markup/model.js +145 -0
- package/dist/markup/parseDom.cjs +59 -0
- package/dist/markup/parseDom.d.cts +7 -0
- package/dist/markup/parseDom.d.ts +7 -0
- package/dist/markup/parseDom.js +35 -0
- package/dist/markup/segmentation.cjs +11 -57
- package/dist/markup/segmentation.d.cts +6 -2
- package/dist/markup/segmentation.d.ts +6 -2
- package/dist/markup/segmentation.js +11 -58
- package/dist/markup/serializeDom.cjs +87 -0
- package/dist/markup/serializeDom.d.cts +7 -0
- package/dist/markup/serializeDom.d.ts +7 -0
- package/dist/markup/serializeDom.js +63 -0
- package/dist/markup/transform.cjs +92 -0
- package/dist/markup/transform.d.cts +11 -0
- package/dist/markup/transform.d.ts +11 -0
- package/dist/markup/transform.js +71 -0
- package/dist/types/node-gyp-build.d.cjs +1 -0
- package/dist/types/node-gyp-build.d.d.cts +3 -0
- package/dist/types/node-gyp-build.d.d.ts +3 -0
- package/dist/types/node-gyp-build.d.js +0 -0
- package/package.json +11 -4
package/dist/align/__tests__/align.test.cjs
CHANGED
|
@@ -131,13 +131,13 @@ async function assertAlignSnapshot(context, epub, transcriptionFilepaths) {
|
|
|
131
131
|
"utf-8"
|
|
132
132
|
);
|
|
133
133
|
const chapterXml = import_epub.Epub.xhtmlParser.parse(chapterContents);
|
|
134
|
-
const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
134
|
+
const { result: segmentation } = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
135
135
|
import_epub.Epub.getXhtmlBody(chapterXml),
|
|
136
136
|
{
|
|
137
137
|
primaryLocale: new Intl.Locale("en-US")
|
|
138
138
|
}
|
|
139
139
|
);
|
|
140
|
-
const chapterSentences = segmentation.
|
|
140
|
+
const chapterSentences = segmentation.map((s) => s.text).filter((s) => s.match(/\S/));
|
|
141
141
|
for (const par of import_epub.Epub.getXmlChildren(seq)) {
|
|
142
142
|
newSnapshot += `
|
|
143
143
|
`;
|
|
@@ -151,14 +151,14 @@ async function assertAlignSnapshot(context, epub, transcriptionFilepaths) {
|
|
|
151
151
|
if (sentenceId === void 0) continue;
|
|
152
152
|
const textSentence = chapterSentences[parseInt(sentenceId)];
|
|
153
153
|
if (!textSentence) continue;
|
|
154
|
-
newSnapshot += `Text: ${textSentence}
|
|
154
|
+
newSnapshot += `Text: ${textSentence.replace(/\n/, "")}
|
|
155
155
|
`;
|
|
156
156
|
const audioSrc = (_d = audio[":@"]) == null ? void 0 : _d["@_src"];
|
|
157
157
|
if (!audioSrc) continue;
|
|
158
158
|
const audioStart = (_e = audio[":@"]) == null ? void 0 : _e["@_clipBegin"];
|
|
159
159
|
const audioEnd = (_f = audio[":@"]) == null ? void 0 : _f["@_clipEnd"];
|
|
160
160
|
if (!audioStart || !audioEnd) continue;
|
|
161
|
-
const audioStartTime = parseFloat(audioStart.slice(0, -1));
|
|
161
|
+
const audioStartTime = parseFloat(audioStart.slice(0, -1)) - 2e-3;
|
|
162
162
|
const audioEndTime = parseFloat(audioEnd.slice(0, -1));
|
|
163
163
|
const audioFilename = (0, import_posix.basename)(audioSrc, (0, import_node_path.extname)(audioSrc));
|
|
164
164
|
const transcriptionFilepath = transcriptionFilepaths.find(
|
|
@@ -271,7 +271,8 @@ void (0, import_node_test.describe)("align", () => {
|
|
|
271
271
|
void 0,
|
|
272
272
|
createTestLogger()
|
|
273
273
|
);
|
|
274
|
-
await aligner.alignBook();
|
|
274
|
+
const timing = await aligner.alignBook();
|
|
275
|
+
if (!process.env["CI"]) timing.print();
|
|
275
276
|
await assertAlignSnapshot(context, epub, transcriptionFilepaths);
|
|
276
277
|
} catch (_) {
|
|
277
278
|
var _error = _, _hasError = true;
|
package/dist/align/__tests__/align.test.js
CHANGED
|
@@ -67,13 +67,13 @@ async function assertAlignSnapshot(context, epub, transcriptionFilepaths) {
|
|
|
67
67
|
"utf-8"
|
|
68
68
|
);
|
|
69
69
|
const chapterXml = Epub.xhtmlParser.parse(chapterContents);
|
|
70
|
-
const segmentation = await getXhtmlSegmentation(
|
|
70
|
+
const { result: segmentation } = await getXhtmlSegmentation(
|
|
71
71
|
Epub.getXhtmlBody(chapterXml),
|
|
72
72
|
{
|
|
73
73
|
primaryLocale: new Intl.Locale("en-US")
|
|
74
74
|
}
|
|
75
75
|
);
|
|
76
|
-
const chapterSentences = segmentation.
|
|
76
|
+
const chapterSentences = segmentation.map((s) => s.text).filter((s) => s.match(/\S/));
|
|
77
77
|
for (const par of Epub.getXmlChildren(seq)) {
|
|
78
78
|
newSnapshot += `
|
|
79
79
|
`;
|
|
@@ -87,14 +87,14 @@ async function assertAlignSnapshot(context, epub, transcriptionFilepaths) {
|
|
|
87
87
|
if (sentenceId === void 0) continue;
|
|
88
88
|
const textSentence = chapterSentences[parseInt(sentenceId)];
|
|
89
89
|
if (!textSentence) continue;
|
|
90
|
-
newSnapshot += `Text: ${textSentence}
|
|
90
|
+
newSnapshot += `Text: ${textSentence.replace(/\n/, "")}
|
|
91
91
|
`;
|
|
92
92
|
const audioSrc = (_d = audio[":@"]) == null ? void 0 : _d["@_src"];
|
|
93
93
|
if (!audioSrc) continue;
|
|
94
94
|
const audioStart = (_e = audio[":@"]) == null ? void 0 : _e["@_clipBegin"];
|
|
95
95
|
const audioEnd = (_f = audio[":@"]) == null ? void 0 : _f["@_clipEnd"];
|
|
96
96
|
if (!audioStart || !audioEnd) continue;
|
|
97
|
-
const audioStartTime = parseFloat(audioStart.slice(0, -1));
|
|
97
|
+
const audioStartTime = parseFloat(audioStart.slice(0, -1)) - 2e-3;
|
|
98
98
|
const audioEndTime = parseFloat(audioEnd.slice(0, -1));
|
|
99
99
|
const audioFilename = posixBasename(audioSrc, extname(audioSrc));
|
|
100
100
|
const transcriptionFilepath = transcriptionFilepaths.find(
|
|
@@ -207,7 +207,8 @@ void describe("align", () => {
|
|
|
207
207
|
void 0,
|
|
208
208
|
createTestLogger()
|
|
209
209
|
);
|
|
210
|
-
await aligner.alignBook();
|
|
210
|
+
const timing = await aligner.alignBook();
|
|
211
|
+
if (!process.env["CI"]) timing.print();
|
|
211
212
|
await assertAlignSnapshot(context, epub, transcriptionFilepaths);
|
|
212
213
|
} catch (_) {
|
|
213
214
|
var _error = _, _hasError = true;
|
package/dist/align/align.cjs
CHANGED
|
@@ -81,7 +81,9 @@ module.exports = __toCommonJS(align_exports);
|
|
|
81
81
|
var import_promises = require("node:fs/promises");
|
|
82
82
|
var import_node_path = require("node:path");
|
|
83
83
|
var import_posix = require("node:path/posix");
|
|
84
|
+
var import_itertools = require("itertools");
|
|
84
85
|
var import_memoize = __toESM(require("memoize"), 1);
|
|
86
|
+
var import_runes2 = require("runes2");
|
|
85
87
|
var import_audiobook = require("@storyteller-platform/audiobook");
|
|
86
88
|
var import_epub = require("@storyteller-platform/epub");
|
|
87
89
|
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
@@ -90,7 +92,6 @@ var import_segmentation = require("../markup/segmentation.cjs");
|
|
|
90
92
|
var import_fuzzy = require("./fuzzy.cjs");
|
|
91
93
|
var import_getSentenceRanges = require("./getSentenceRanges.cjs");
|
|
92
94
|
var import_slugify = require("./slugify.cjs");
|
|
93
|
-
const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
|
|
94
95
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
95
96
|
var _stack = [];
|
|
96
97
|
try {
|
|
@@ -141,6 +142,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
141
142
|
class Aligner {
|
|
142
143
|
constructor(epub, audiofiles, transcriptions, granularity, languageOverride, logger) {
|
|
143
144
|
this.epub = epub;
|
|
145
|
+
this.audiofiles = audiofiles;
|
|
144
146
|
this.languageOverride = languageOverride;
|
|
145
147
|
this.logger = logger;
|
|
146
148
|
this.transcription = concatTranscriptions(transcriptions, audiofiles);
|
|
@@ -155,71 +157,92 @@ class Aligner {
|
|
|
155
157
|
report = {
|
|
156
158
|
chapters: []
|
|
157
159
|
};
|
|
158
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset,
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
160
|
+
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
|
|
161
|
+
const reverse = dir < 0;
|
|
162
|
+
if (dir < 0) {
|
|
163
|
+
epubSentences = epubSentences.toReversed().map((s) => (0, import_runes2.runes)(s).toReversed().join(""));
|
|
164
|
+
transcriptionText = (0, import_runes2.runes)(transcriptionText).toReversed().join("");
|
|
165
|
+
lastMatchOffset = transcriptionText.length - lastMatchOffset;
|
|
166
|
+
}
|
|
167
|
+
const flatSliceIndices = [
|
|
168
|
+
0,
|
|
169
|
+
...this.alignedChapters.toSorted(
|
|
170
|
+
(a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
|
|
171
|
+
).flatMap((aligned) => [
|
|
172
|
+
reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
|
|
173
|
+
reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
|
|
174
|
+
]),
|
|
175
|
+
transcriptionText.length
|
|
176
|
+
];
|
|
177
|
+
const sliceIndices = [];
|
|
178
|
+
for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
|
|
179
|
+
sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
|
|
180
|
+
}
|
|
181
|
+
const allSlices = [];
|
|
182
|
+
let startSlice = 0;
|
|
183
|
+
for (const [i, [start, end]] of (0, import_itertools.enumerate)(sliceIndices)) {
|
|
184
|
+
if (lastMatchOffset >= start && lastMatchOffset < end) {
|
|
185
|
+
if (!reverse) {
|
|
186
|
+
startSlice = i + 1;
|
|
187
|
+
allSlices.push({
|
|
188
|
+
start,
|
|
189
|
+
text: transcriptionText.slice(start, lastMatchOffset)
|
|
190
|
+
});
|
|
183
191
|
}
|
|
192
|
+
allSlices.push({
|
|
193
|
+
start: lastMatchOffset,
|
|
194
|
+
text: transcriptionText.slice(lastMatchOffset, end)
|
|
195
|
+
});
|
|
196
|
+
} else if (!reverse) {
|
|
197
|
+
allSlices.push({ start, text: transcriptionText.slice(start, end) });
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
const slices = allSlices.filter((slice) => slice.text.length);
|
|
201
|
+
if (reverse && !slices.length) {
|
|
202
|
+
const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
|
|
203
|
+
if (indices) {
|
|
204
|
+
slices.push({
|
|
205
|
+
start: indices[0],
|
|
206
|
+
text: transcriptionText.slice(...indices)
|
|
207
|
+
});
|
|
184
208
|
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
209
|
+
}
|
|
210
|
+
for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
|
|
211
|
+
let startSentence = 0;
|
|
212
|
+
while (startSentence < epubSentences.length) {
|
|
213
|
+
const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
214
|
+
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
215
|
+
needle,
|
|
216
|
+
slice.text,
|
|
217
|
+
Math.max(Math.floor(0.1 * needle.length), 1)
|
|
189
218
|
);
|
|
190
|
-
|
|
191
|
-
const
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
);
|
|
197
|
-
if (firstMatch) {
|
|
198
|
-
return {
|
|
199
|
-
startSentence,
|
|
200
|
-
transcriptionOffset: (firstMatch.index + startIndex) % transcriptionText.length
|
|
201
|
-
};
|
|
202
|
-
}
|
|
203
|
-
startSentence += 3;
|
|
219
|
+
if (firstMatch) {
|
|
220
|
+
const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
|
|
221
|
+
return {
|
|
222
|
+
startSentence: reverse ? epubSentences.length - startSentence : startSentence,
|
|
223
|
+
transcriptionOffset: start
|
|
224
|
+
};
|
|
204
225
|
}
|
|
226
|
+
startSentence += 3;
|
|
205
227
|
}
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
228
|
+
}
|
|
229
|
+
if (reverse) {
|
|
230
|
+
return {
|
|
231
|
+
startSentence: epubSentences.length,
|
|
232
|
+
transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
|
|
233
|
+
};
|
|
211
234
|
}
|
|
212
235
|
return { startSentence: 0, transcriptionOffset: null };
|
|
213
236
|
}
|
|
214
237
|
async getChapterSentences(chapterId) {
|
|
215
238
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
216
|
-
const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
239
|
+
const { result: segmentation } = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
217
240
|
import_epub.Epub.getXhtmlBody(chapterXml),
|
|
218
241
|
{
|
|
219
242
|
primaryLocale: this.languageOverride ?? await this.epub.getLanguage()
|
|
220
243
|
}
|
|
221
244
|
);
|
|
222
|
-
return segmentation.
|
|
245
|
+
return segmentation.map((s) => s.text).filter((s) => s.match(/\S/));
|
|
223
246
|
}
|
|
224
247
|
async writeAlignedChapter(alignedChapter) {
|
|
225
248
|
const { chapter, sentenceRanges, xml } = alignedChapter;
|
|
@@ -314,7 +337,7 @@ class Aligner {
|
|
|
314
337
|
}, [])
|
|
315
338
|
});
|
|
316
339
|
}
|
|
317
|
-
async alignChapter(startSentence, chapterId, transcriptionOffset, locale,
|
|
340
|
+
async alignChapter(startSentence, endSentence, chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
|
|
318
341
|
const timing = (0, import_ghost_story.createTiming)();
|
|
319
342
|
timing.start("read contents");
|
|
320
343
|
const manifest = await this.epub.getManifest();
|
|
@@ -331,20 +354,14 @@ class Aligner {
|
|
|
331
354
|
timing.start("align sentences");
|
|
332
355
|
const { sentenceRanges, transcriptionOffset: endTranscriptionOffset } = await (0, import_getSentenceRanges.getSentenceRanges)(
|
|
333
356
|
startSentence,
|
|
357
|
+
endSentence,
|
|
334
358
|
this.transcription,
|
|
335
359
|
chapterSentences,
|
|
336
360
|
transcriptionOffset,
|
|
337
|
-
|
|
338
|
-
|
|
361
|
+
transcriptionEndOffset,
|
|
362
|
+
locale
|
|
339
363
|
);
|
|
340
364
|
timing.end("align sentences");
|
|
341
|
-
timing.start("expand ranges");
|
|
342
|
-
const interpolated = await (0, import_getSentenceRanges.interpolateSentenceRanges)(
|
|
343
|
-
sentenceRanges,
|
|
344
|
-
lastSentenceRange
|
|
345
|
-
);
|
|
346
|
-
const expanded = (0, import_getSentenceRanges.expandEmptySentenceRanges)(interpolated);
|
|
347
|
-
timing.end("expand ranges");
|
|
348
365
|
const storytellerStylesheetUrl = (0, import_posix.relative)(
|
|
349
366
|
(0, import_posix.dirname)(chapter.href),
|
|
350
367
|
"Styles/storyteller-readaloud.css"
|
|
@@ -357,25 +374,25 @@ class Aligner {
|
|
|
357
374
|
this.alignedChapters.push({
|
|
358
375
|
chapter,
|
|
359
376
|
xml: chapterXml,
|
|
360
|
-
sentenceRanges
|
|
361
|
-
startOffset: transcriptionOffset,
|
|
362
|
-
endOffset: endTranscriptionOffset
|
|
377
|
+
sentenceRanges,
|
|
378
|
+
startOffset: mapping.map(transcriptionOffset),
|
|
379
|
+
endOffset: mapping.map(endTranscriptionOffset, -1)
|
|
363
380
|
});
|
|
364
381
|
this.addChapterReport(
|
|
365
382
|
chapter,
|
|
366
383
|
chapterSentences,
|
|
367
|
-
|
|
384
|
+
sentenceRanges,
|
|
368
385
|
startSentence,
|
|
369
386
|
transcriptionOffset
|
|
370
387
|
);
|
|
371
388
|
return {
|
|
372
|
-
lastSentenceRange:
|
|
389
|
+
lastSentenceRange: sentenceRanges.at(-1) ?? null,
|
|
373
390
|
endTranscriptionOffset,
|
|
374
391
|
timing
|
|
375
392
|
};
|
|
376
393
|
}
|
|
377
394
|
async alignBook(onProgress) {
|
|
378
|
-
var _a, _b, _c, _d, _e, _f, _g;
|
|
395
|
+
var _a, _b, _c, _d, _e, _f, _g, _h;
|
|
379
396
|
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
380
397
|
this.timing.setMetadata("language", locale.toString());
|
|
381
398
|
this.timing.setMetadata("granularity", this.granularity);
|
|
@@ -386,7 +403,6 @@ class Aligner {
|
|
|
386
403
|
locale
|
|
387
404
|
);
|
|
388
405
|
let lastTranscriptionOffset = 0;
|
|
389
|
-
let lastSentenceRange = null;
|
|
390
406
|
for (let index = 0; index < spine.length; index++) {
|
|
391
407
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
392
408
|
const spineItem = spine[index];
|
|
@@ -418,36 +434,72 @@ class Aligner {
|
|
|
418
434
|
const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
|
|
419
435
|
slugifiedChapterSentences,
|
|
420
436
|
transcriptionText,
|
|
421
|
-
mapping.map(lastTranscriptionOffset, -1)
|
|
422
|
-
mapping
|
|
437
|
+
mapping.map(lastTranscriptionOffset, -1)
|
|
423
438
|
);
|
|
424
|
-
|
|
425
|
-
if (transcriptionOffset === null) {
|
|
439
|
+
if (slugifiedOffset === null) {
|
|
426
440
|
(_f = this.logger) == null ? void 0 : _f.info(
|
|
427
441
|
`Couldn't find matching transcription for chapter #${index}`
|
|
428
442
|
);
|
|
429
443
|
continue;
|
|
430
444
|
}
|
|
431
|
-
|
|
432
|
-
|
|
445
|
+
const transcriptionOffset = mapping.invert().map(slugifiedOffset, -1);
|
|
446
|
+
const {
|
|
447
|
+
startSentence: startEndSentence,
|
|
448
|
+
transcriptionOffset: slugifiedEndOffset
|
|
449
|
+
} = this.findBestOffset(
|
|
450
|
+
slugifiedChapterSentences,
|
|
451
|
+
transcriptionText,
|
|
452
|
+
Math.min(
|
|
453
|
+
transcriptionText.length,
|
|
454
|
+
slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
|
|
455
|
+
),
|
|
456
|
+
-1
|
|
457
|
+
);
|
|
458
|
+
const endSentence = startEndSentence;
|
|
459
|
+
const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
|
|
460
|
+
if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
|
|
461
|
+
(_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
|
|
462
|
+
}
|
|
463
|
+
(_h = this.logger) == null ? void 0 : _h.info(
|
|
464
|
+
`Chapter #${index} best matches transcription from ${transcriptionOffset} to ${endOffset}, from sentence ${startSentence} to ${endSentence} (of ${slugifiedChapterSentences.length}) in the book`
|
|
433
465
|
);
|
|
434
466
|
const result = await this.alignChapter(
|
|
435
467
|
startSentence,
|
|
468
|
+
endSentence,
|
|
436
469
|
chapterId,
|
|
437
470
|
transcriptionOffset,
|
|
471
|
+
endOffset,
|
|
438
472
|
locale,
|
|
439
|
-
|
|
473
|
+
mapping
|
|
440
474
|
);
|
|
441
|
-
lastSentenceRange = result.lastSentenceRange;
|
|
442
475
|
lastTranscriptionOffset = result.endTranscriptionOffset;
|
|
443
476
|
this.timing.add(result.timing.summary());
|
|
444
477
|
}
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
478
|
+
const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
|
|
479
|
+
const firstRangeA = a.sentenceRanges[0];
|
|
480
|
+
const firstRangeB = b.sentenceRanges[0];
|
|
481
|
+
if (!firstRangeA) return 1;
|
|
482
|
+
if (!firstRangeB) return -1;
|
|
483
|
+
const firstAudiofileIndexA = this.audiofiles.indexOf(
|
|
484
|
+
firstRangeA.audiofile
|
|
448
485
|
);
|
|
449
|
-
|
|
450
|
-
|
|
486
|
+
const firstAudiofileIndexB = this.audiofiles.indexOf(
|
|
487
|
+
firstRangeB.audiofile
|
|
488
|
+
);
|
|
489
|
+
if (firstAudiofileIndexA === firstAudiofileIndexB) {
|
|
490
|
+
return firstRangeA.start - firstRangeB.start;
|
|
491
|
+
}
|
|
492
|
+
return firstAudiofileIndexA - firstAudiofileIndexB;
|
|
493
|
+
});
|
|
494
|
+
let lastSentenceRange = null;
|
|
495
|
+
for (const alignedChapter of audioOrderedChapters) {
|
|
496
|
+
const interpolated = await (0, import_getSentenceRanges.interpolateSentenceRanges)(
|
|
497
|
+
alignedChapter.sentenceRanges,
|
|
498
|
+
lastSentenceRange
|
|
499
|
+
);
|
|
500
|
+
const expanded = (0, import_getSentenceRanges.expandEmptySentenceRanges)(interpolated);
|
|
501
|
+
alignedChapter.sentenceRanges = expanded;
|
|
502
|
+
lastSentenceRange = expanded.at(-1) ?? null;
|
|
451
503
|
await this.writeAlignedChapter(alignedChapter);
|
|
452
504
|
}
|
|
453
505
|
await this.epub.addMetadata({
|
package/dist/align/align.d.cts
CHANGED
|
@@ -37,6 +37,7 @@ interface AlignOptions {
|
|
|
37
37
|
declare function align(input: string, output: string, transcriptionsDir: string, audiobookDir: string, options: AlignOptions): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
38
38
|
declare class Aligner {
|
|
39
39
|
epub: Epub;
|
|
40
|
+
private audiofiles;
|
|
40
41
|
private languageOverride?;
|
|
41
42
|
private logger?;
|
|
42
43
|
private transcription;
|
package/dist/align/align.d.ts
CHANGED
|
@@ -37,6 +37,7 @@ interface AlignOptions {
|
|
|
37
37
|
declare function align(input: string, output: string, transcriptionsDir: string, audiobookDir: string, options: AlignOptions): Promise<_storyteller_platform_ghost_story.TimingAggregator>;
|
|
38
38
|
declare class Aligner {
|
|
39
39
|
epub: Epub;
|
|
40
|
+
private audiofiles;
|
|
40
41
|
private languageOverride?;
|
|
41
42
|
private logger?;
|
|
42
43
|
private transcription;
|