@storyteller-platform/align 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +282 -0
- package/dist/align/__tests__/align.test.d.cts +2 -0
- package/dist/align/__tests__/align.test.d.ts +2 -0
- package/dist/align/__tests__/align.test.js +218 -0
- package/dist/align/__tests__/slugify.test.cjs +64 -0
- package/dist/align/__tests__/slugify.test.d.cts +2 -0
- package/dist/align/__tests__/slugify.test.d.ts +2 -0
- package/dist/align/__tests__/slugify.test.js +41 -0
- package/dist/align/align.cjs +41 -21
- package/dist/align/align.js +41 -21
- package/dist/align/fuzzy.cjs +1 -1
- package/dist/align/fuzzy.js +1 -1
- package/dist/align/getSentenceRanges.cjs +24 -12
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +24 -12
- package/dist/align/slugify.cjs +125 -0
- package/dist/align/slugify.d.cts +8 -0
- package/dist/align/slugify.d.ts +8 -0
- package/dist/align/slugify.js +102 -0
- package/package.json +6 -3
package/dist/align/align.cjs
CHANGED
|
@@ -89,10 +89,12 @@ var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
|
89
89
|
var import_segmentation = require("../markup/segmentation.cjs");
|
|
90
90
|
var import_fuzzy = require("./fuzzy.cjs");
|
|
91
91
|
var import_getSentenceRanges = require("./getSentenceRanges.cjs");
|
|
92
|
+
var import_slugify = require("./slugify.cjs");
|
|
92
93
|
const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
|
|
93
94
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
94
95
|
var _stack = [];
|
|
95
96
|
try {
|
|
97
|
+
await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
|
|
96
98
|
await (0, import_promises.copyFile)(input, output);
|
|
97
99
|
const audiobookFiles = await (0, import_promises.readdir)(audiobookDir).then(
|
|
98
100
|
(filenames) => filenames.filter((f) => (0, import_audiobook.isAudioFile)(f)).map((f) => (0, import_node_path.join)(audiobookDir, f))
|
|
@@ -118,6 +120,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
118
120
|
options.logger
|
|
119
121
|
);
|
|
120
122
|
const timing = await aligner.alignBook(options.onProgress);
|
|
123
|
+
await epub.saveAndClose();
|
|
121
124
|
if (options.reportsPath) {
|
|
122
125
|
await (0, import_promises.mkdir)((0, import_node_path.dirname)(options.reportsPath), { recursive: true });
|
|
123
126
|
await (0, import_promises.writeFile)(
|
|
@@ -152,7 +155,7 @@ class Aligner {
|
|
|
152
155
|
report = {
|
|
153
156
|
chapters: []
|
|
154
157
|
};
|
|
155
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
|
|
158
|
+
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
|
|
156
159
|
let i = 0;
|
|
157
160
|
while (i < transcriptionText.length) {
|
|
158
161
|
let startSentence = 0;
|
|
@@ -164,11 +167,13 @@ class Aligner {
|
|
|
164
167
|
let startSeen = null;
|
|
165
168
|
let endSeen = null;
|
|
166
169
|
for (const aligned of this.alignedChapters) {
|
|
167
|
-
|
|
168
|
-
|
|
170
|
+
const alignedStart = mapping.map(aligned.startOffset, -1);
|
|
171
|
+
const alignedEnd = mapping.map(aligned.endOffset, -1);
|
|
172
|
+
if (startSeen !== null && endSeen === alignedStart) {
|
|
173
|
+
endSeen = alignedEnd;
|
|
169
174
|
} else {
|
|
170
|
-
startSeen =
|
|
171
|
-
endSeen =
|
|
175
|
+
startSeen = alignedStart;
|
|
176
|
+
endSeen = alignedEnd;
|
|
172
177
|
}
|
|
173
178
|
if (startIndex >= startSeen && startIndex < endSeen) {
|
|
174
179
|
startIndex = endSeen;
|
|
@@ -183,7 +188,7 @@ class Aligner {
|
|
|
183
188
|
endIndex
|
|
184
189
|
);
|
|
185
190
|
while (startSentence < epubSentences.length) {
|
|
186
|
-
const queryString = epubSentences.slice(startSentence, startSentence + 6).join("
|
|
191
|
+
const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
187
192
|
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
188
193
|
queryString.toLowerCase(),
|
|
189
194
|
transcriptionTextSlice.toLowerCase(),
|
|
@@ -309,7 +314,7 @@ class Aligner {
|
|
|
309
314
|
}, [])
|
|
310
315
|
});
|
|
311
316
|
}
|
|
312
|
-
async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
|
|
317
|
+
async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
|
|
313
318
|
const timing = (0, import_ghost_story.createTiming)();
|
|
314
319
|
timing.start("read contents");
|
|
315
320
|
const manifest = await this.epub.getManifest();
|
|
@@ -329,6 +334,7 @@ class Aligner {
|
|
|
329
334
|
this.transcription,
|
|
330
335
|
chapterSentences,
|
|
331
336
|
transcriptionOffset,
|
|
337
|
+
locale,
|
|
332
338
|
lastSentenceRange
|
|
333
339
|
);
|
|
334
340
|
timing.end("align sentences");
|
|
@@ -369,53 +375,67 @@ class Aligner {
|
|
|
369
375
|
};
|
|
370
376
|
}
|
|
371
377
|
async alignBook(onProgress) {
|
|
372
|
-
var _a, _b, _c, _d, _e, _f;
|
|
373
|
-
this.
|
|
374
|
-
|
|
375
|
-
((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
|
|
376
|
-
);
|
|
378
|
+
var _a, _b, _c, _d, _e, _f, _g;
|
|
379
|
+
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
380
|
+
this.timing.setMetadata("language", locale.toString());
|
|
377
381
|
this.timing.setMetadata("granularity", this.granularity);
|
|
378
382
|
const spine = await this.epub.getSpineItems();
|
|
379
|
-
const
|
|
383
|
+
const manifest = await this.epub.getManifest();
|
|
384
|
+
const { result: transcriptionText, mapping } = await (0, import_slugify.slugify)(
|
|
385
|
+
this.transcription.transcript,
|
|
386
|
+
locale
|
|
387
|
+
);
|
|
380
388
|
let lastTranscriptionOffset = 0;
|
|
381
389
|
let lastSentenceRange = null;
|
|
382
390
|
for (let index = 0; index < spine.length; index++) {
|
|
383
391
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
384
392
|
const spineItem = spine[index];
|
|
385
|
-
(
|
|
393
|
+
(_a = this.logger) == null ? void 0 : _a.info(
|
|
386
394
|
`Aligning epub item #${index} : ${(0, import_posix.basename)(spineItem.href)}`
|
|
387
395
|
);
|
|
388
396
|
const chapterId = spineItem.id;
|
|
397
|
+
if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
|
|
398
|
+
continue;
|
|
399
|
+
}
|
|
389
400
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
401
|
+
const slugifiedChapterSentences = [];
|
|
402
|
+
for (const chapterSentence of chapterSentences) {
|
|
403
|
+
slugifiedChapterSentences.push(
|
|
404
|
+
(await (0, import_slugify.slugify)(chapterSentence, locale)).result
|
|
405
|
+
);
|
|
406
|
+
}
|
|
390
407
|
if (chapterSentences.length === 0) {
|
|
391
|
-
(
|
|
408
|
+
(_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
|
|
392
409
|
continue;
|
|
393
410
|
}
|
|
394
411
|
if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
395
412
|
chapterSentences[0].split(" ").length < 4) {
|
|
396
|
-
(
|
|
413
|
+
(_e = this.logger) == null ? void 0 : _e.info(
|
|
397
414
|
`Chapter #${index} is fewer than four words; skipping`
|
|
398
415
|
);
|
|
399
416
|
continue;
|
|
400
417
|
}
|
|
401
|
-
const { startSentence, transcriptionOffset } = this.findBestOffset(
|
|
402
|
-
|
|
418
|
+
const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
|
|
419
|
+
slugifiedChapterSentences,
|
|
403
420
|
transcriptionText,
|
|
404
|
-
lastTranscriptionOffset
|
|
421
|
+
mapping.map(lastTranscriptionOffset, -1),
|
|
422
|
+
mapping
|
|
405
423
|
);
|
|
424
|
+
const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
|
|
406
425
|
if (transcriptionOffset === null) {
|
|
407
|
-
(
|
|
426
|
+
(_f = this.logger) == null ? void 0 : _f.info(
|
|
408
427
|
`Couldn't find matching transcription for chapter #${index}`
|
|
409
428
|
);
|
|
410
429
|
continue;
|
|
411
430
|
}
|
|
412
|
-
(
|
|
431
|
+
(_g = this.logger) == null ? void 0 : _g.info(
|
|
413
432
|
`Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
|
|
414
433
|
);
|
|
415
434
|
const result = await this.alignChapter(
|
|
416
435
|
startSentence,
|
|
417
436
|
chapterId,
|
|
418
437
|
transcriptionOffset,
|
|
438
|
+
locale,
|
|
419
439
|
lastSentenceRange
|
|
420
440
|
);
|
|
421
441
|
lastSentenceRange = result.lastSentenceRange;
|
package/dist/align/align.js
CHANGED
|
@@ -23,10 +23,12 @@ import {
|
|
|
23
23
|
getSentenceRanges,
|
|
24
24
|
interpolateSentenceRanges
|
|
25
25
|
} from "./getSentenceRanges.js";
|
|
26
|
+
import { slugify } from "./slugify.js";
|
|
26
27
|
const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
|
|
27
28
|
async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
28
29
|
var _stack = [];
|
|
29
30
|
try {
|
|
31
|
+
await mkdir(dirname(output), { recursive: true });
|
|
30
32
|
await copyFile(input, output);
|
|
31
33
|
const audiobookFiles = await readdir(audiobookDir).then(
|
|
32
34
|
(filenames) => filenames.filter((f) => isAudioFile(f)).map((f) => autoJoin(audiobookDir, f))
|
|
@@ -52,6 +54,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
|
|
|
52
54
|
options.logger
|
|
53
55
|
);
|
|
54
56
|
const timing = await aligner.alignBook(options.onProgress);
|
|
57
|
+
await epub.saveAndClose();
|
|
55
58
|
if (options.reportsPath) {
|
|
56
59
|
await mkdir(autoDirname(options.reportsPath), { recursive: true });
|
|
57
60
|
await writeFile(
|
|
@@ -86,7 +89,7 @@ class Aligner {
|
|
|
86
89
|
report = {
|
|
87
90
|
chapters: []
|
|
88
91
|
};
|
|
89
|
-
findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
|
|
92
|
+
findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
|
|
90
93
|
let i = 0;
|
|
91
94
|
while (i < transcriptionText.length) {
|
|
92
95
|
let startSentence = 0;
|
|
@@ -98,11 +101,13 @@ class Aligner {
|
|
|
98
101
|
let startSeen = null;
|
|
99
102
|
let endSeen = null;
|
|
100
103
|
for (const aligned of this.alignedChapters) {
|
|
101
|
-
|
|
102
|
-
|
|
104
|
+
const alignedStart = mapping.map(aligned.startOffset, -1);
|
|
105
|
+
const alignedEnd = mapping.map(aligned.endOffset, -1);
|
|
106
|
+
if (startSeen !== null && endSeen === alignedStart) {
|
|
107
|
+
endSeen = alignedEnd;
|
|
103
108
|
} else {
|
|
104
|
-
startSeen =
|
|
105
|
-
endSeen =
|
|
109
|
+
startSeen = alignedStart;
|
|
110
|
+
endSeen = alignedEnd;
|
|
106
111
|
}
|
|
107
112
|
if (startIndex >= startSeen && startIndex < endSeen) {
|
|
108
113
|
startIndex = endSeen;
|
|
@@ -117,7 +122,7 @@ class Aligner {
|
|
|
117
122
|
endIndex
|
|
118
123
|
);
|
|
119
124
|
while (startSentence < epubSentences.length) {
|
|
120
|
-
const queryString = epubSentences.slice(startSentence, startSentence + 6).join("
|
|
125
|
+
const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
|
|
121
126
|
const firstMatch = findNearestMatch(
|
|
122
127
|
queryString.toLowerCase(),
|
|
123
128
|
transcriptionTextSlice.toLowerCase(),
|
|
@@ -243,7 +248,7 @@ class Aligner {
|
|
|
243
248
|
}, [])
|
|
244
249
|
});
|
|
245
250
|
}
|
|
246
|
-
async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
|
|
251
|
+
async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
|
|
247
252
|
const timing = createTiming();
|
|
248
253
|
timing.start("read contents");
|
|
249
254
|
const manifest = await this.epub.getManifest();
|
|
@@ -263,6 +268,7 @@ class Aligner {
|
|
|
263
268
|
this.transcription,
|
|
264
269
|
chapterSentences,
|
|
265
270
|
transcriptionOffset,
|
|
271
|
+
locale,
|
|
266
272
|
lastSentenceRange
|
|
267
273
|
);
|
|
268
274
|
timing.end("align sentences");
|
|
@@ -303,53 +309,67 @@ class Aligner {
|
|
|
303
309
|
};
|
|
304
310
|
}
|
|
305
311
|
async alignBook(onProgress) {
|
|
306
|
-
var _a, _b, _c, _d, _e, _f;
|
|
307
|
-
this.
|
|
308
|
-
|
|
309
|
-
((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
|
|
310
|
-
);
|
|
312
|
+
var _a, _b, _c, _d, _e, _f, _g;
|
|
313
|
+
const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
|
|
314
|
+
this.timing.setMetadata("language", locale.toString());
|
|
311
315
|
this.timing.setMetadata("granularity", this.granularity);
|
|
312
316
|
const spine = await this.epub.getSpineItems();
|
|
313
|
-
const
|
|
317
|
+
const manifest = await this.epub.getManifest();
|
|
318
|
+
const { result: transcriptionText, mapping } = await slugify(
|
|
319
|
+
this.transcription.transcript,
|
|
320
|
+
locale
|
|
321
|
+
);
|
|
314
322
|
let lastTranscriptionOffset = 0;
|
|
315
323
|
let lastSentenceRange = null;
|
|
316
324
|
for (let index = 0; index < spine.length; index++) {
|
|
317
325
|
onProgress == null ? void 0 : onProgress(index / spine.length);
|
|
318
326
|
const spineItem = spine[index];
|
|
319
|
-
(
|
|
327
|
+
(_a = this.logger) == null ? void 0 : _a.info(
|
|
320
328
|
`Aligning epub item #${index} : ${basename(spineItem.href)}`
|
|
321
329
|
);
|
|
322
330
|
const chapterId = spineItem.id;
|
|
331
|
+
if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
|
|
332
|
+
continue;
|
|
333
|
+
}
|
|
323
334
|
const chapterSentences = await this.getChapterSentences(chapterId);
|
|
335
|
+
const slugifiedChapterSentences = [];
|
|
336
|
+
for (const chapterSentence of chapterSentences) {
|
|
337
|
+
slugifiedChapterSentences.push(
|
|
338
|
+
(await slugify(chapterSentence, locale)).result
|
|
339
|
+
);
|
|
340
|
+
}
|
|
324
341
|
if (chapterSentences.length === 0) {
|
|
325
|
-
(
|
|
342
|
+
(_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
|
|
326
343
|
continue;
|
|
327
344
|
}
|
|
328
345
|
if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
329
346
|
chapterSentences[0].split(" ").length < 4) {
|
|
330
|
-
(
|
|
347
|
+
(_e = this.logger) == null ? void 0 : _e.info(
|
|
331
348
|
`Chapter #${index} is fewer than four words; skipping`
|
|
332
349
|
);
|
|
333
350
|
continue;
|
|
334
351
|
}
|
|
335
|
-
const { startSentence, transcriptionOffset } = this.findBestOffset(
|
|
336
|
-
|
|
352
|
+
const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
|
|
353
|
+
slugifiedChapterSentences,
|
|
337
354
|
transcriptionText,
|
|
338
|
-
lastTranscriptionOffset
|
|
355
|
+
mapping.map(lastTranscriptionOffset, -1),
|
|
356
|
+
mapping
|
|
339
357
|
);
|
|
358
|
+
const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
|
|
340
359
|
if (transcriptionOffset === null) {
|
|
341
|
-
(
|
|
360
|
+
(_f = this.logger) == null ? void 0 : _f.info(
|
|
342
361
|
`Couldn't find matching transcription for chapter #${index}`
|
|
343
362
|
);
|
|
344
363
|
continue;
|
|
345
364
|
}
|
|
346
|
-
(
|
|
365
|
+
(_g = this.logger) == null ? void 0 : _g.info(
|
|
347
366
|
`Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
|
|
348
367
|
);
|
|
349
368
|
const result = await this.alignChapter(
|
|
350
369
|
startSentence,
|
|
351
370
|
chapterId,
|
|
352
371
|
transcriptionOffset,
|
|
372
|
+
locale,
|
|
353
373
|
lastSentenceRange
|
|
354
374
|
);
|
|
355
375
|
lastSentenceRange = result.lastSentenceRange;
|
package/dist/align/fuzzy.cjs
CHANGED
|
@@ -108,7 +108,7 @@ function expand(subsequence, sequence, maxDist) {
|
|
|
108
108
|
function* levenshteinNgram(subsequence, sequence, maxDist) {
|
|
109
109
|
const subsequenceLength = subsequence.length;
|
|
110
110
|
const sequenceLength = sequence.length;
|
|
111
|
-
const ngramLength = Math.
|
|
111
|
+
const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
|
|
112
112
|
if (ngramLength === 0) {
|
|
113
113
|
throw new Error("The subsequence length must be greater than maxDist");
|
|
114
114
|
}
|
package/dist/align/fuzzy.js
CHANGED
|
@@ -86,7 +86,7 @@ function expand(subsequence, sequence, maxDist) {
|
|
|
86
86
|
function* levenshteinNgram(subsequence, sequence, maxDist) {
|
|
87
87
|
const subsequenceLength = subsequence.length;
|
|
88
88
|
const sequenceLength = sequence.length;
|
|
89
|
-
const ngramLength = Math.
|
|
89
|
+
const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
|
|
90
90
|
if (ngramLength === 0) {
|
|
91
91
|
throw new Error("The subsequence length must be greater than maxDist");
|
|
92
92
|
}
|
|
@@ -28,6 +28,7 @@ module.exports = __toCommonJS(getSentenceRanges_exports);
|
|
|
28
28
|
var import_text_segmentation = require("@echogarden/text-segmentation");
|
|
29
29
|
var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
30
30
|
var import_fuzzy = require("./fuzzy.cjs");
|
|
31
|
+
var import_slugify = require("./slugify.cjs");
|
|
31
32
|
async function getSentencesWithOffsets(text) {
|
|
32
33
|
const sentences = await (0, import_text_segmentation.segmentText)(text).then(
|
|
33
34
|
(r) => r.sentences.map((s) => s.text)
|
|
@@ -75,7 +76,7 @@ function getWindowIndexFromOffset(window, offset) {
|
|
|
75
76
|
/**
 * Collapses every run of whitespace in `input` into a single space.
 * Leading and trailing runs are collapsed too (not trimmed).
 */
function collapseWhitespace(input) {
  // Splitting on whitespace runs and re-joining with one space yields the
  // same string as replacing each run with " ".
  const pieces = input.split(/\s+/);
  return pieces.join(" ");
}
|
|
78
|
-
async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
|
|
79
|
+
async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
|
|
79
80
|
const sentenceRanges = [];
|
|
80
81
|
const fullTranscriptionText = transcription.transcript;
|
|
81
82
|
const transcriptionText = fullTranscriptionText.slice(chapterOffset);
|
|
@@ -83,13 +84,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
83
84
|
transcriptionText
|
|
84
85
|
).then((s) => s.map((sentence) => sentence.toLowerCase()));
|
|
85
86
|
let startSentenceEntry = startSentence;
|
|
86
|
-
const sentenceEntries =
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
87
|
+
const sentenceEntries = [];
|
|
88
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
89
|
+
const sentence = (await (0, import_slugify.slugify)(sentences[i], locale)).result;
|
|
90
|
+
if (sentence.length <= 3) {
|
|
91
|
+
if (i < startSentence) startSentenceEntry--;
|
|
92
|
+
continue;
|
|
90
93
|
}
|
|
91
|
-
|
|
92
|
-
}
|
|
94
|
+
sentenceEntries.push([i, sentence]);
|
|
95
|
+
}
|
|
93
96
|
let transcriptionWindowIndex = 0;
|
|
94
97
|
let transcriptionWindowOffset = 0;
|
|
95
98
|
let lastGoodTranscriptionWindow = 0;
|
|
@@ -102,7 +105,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
102
105
|
transcriptionWindowIndex,
|
|
103
106
|
transcriptionWindowIndex + 10
|
|
104
107
|
);
|
|
105
|
-
const transcriptionWindow =
|
|
108
|
+
const { result: transcriptionWindow, mapping } = await (0, import_slugify.slugify)(
|
|
109
|
+
transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
|
|
110
|
+
locale
|
|
111
|
+
);
|
|
112
|
+
const inverted = mapping.invert();
|
|
106
113
|
const query = collapseWhitespace(sentence.trim()).toLowerCase();
|
|
107
114
|
const firstMatch = (0, import_fuzzy.findNearestMatch)(
|
|
108
115
|
query,
|
|
@@ -125,8 +132,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
125
132
|
continue;
|
|
126
133
|
}
|
|
127
134
|
const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
|
|
135
|
+
const matchStart = inverted.map(firstMatch.index, 1);
|
|
136
|
+
const matchEnd = inverted.map(
|
|
137
|
+
firstMatch.index + firstMatch.match.length,
|
|
138
|
+
-1
|
|
139
|
+
);
|
|
128
140
|
const startResult = findStartTimestamp(
|
|
129
|
-
|
|
141
|
+
matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
|
|
130
142
|
transcription
|
|
131
143
|
);
|
|
132
144
|
if (!startResult) {
|
|
@@ -136,7 +148,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
136
148
|
let start = startResult.start;
|
|
137
149
|
const audiofile = startResult.audiofile;
|
|
138
150
|
const end = findEndTimestamp(
|
|
139
|
-
|
|
151
|
+
matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
|
|
140
152
|
transcription
|
|
141
153
|
) ?? startResult.end;
|
|
142
154
|
if (sentenceRanges.length > 0) {
|
|
@@ -177,10 +189,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
177
189
|
audiofile
|
|
178
190
|
});
|
|
179
191
|
notFound = 0;
|
|
180
|
-
lastMatchEnd =
|
|
192
|
+
lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
|
|
181
193
|
const windowIndexResult = getWindowIndexFromOffset(
|
|
182
194
|
transcriptionWindowList,
|
|
183
|
-
|
|
195
|
+
matchEnd + transcriptionWindowOffset
|
|
184
196
|
);
|
|
185
197
|
transcriptionWindowIndex += windowIndexResult.index;
|
|
186
198
|
transcriptionWindowOffset = windowIndexResult.offset;
|
|
@@ -14,7 +14,7 @@ type SentenceRange = {
|
|
|
14
14
|
audiofile: string;
|
|
15
15
|
};
|
|
16
16
|
declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
|
|
17
|
-
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
|
|
17
|
+
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
|
|
18
18
|
sentenceRanges: SentenceRange[];
|
|
19
19
|
transcriptionOffset: number;
|
|
20
20
|
}>;
|
|
@@ -14,7 +14,7 @@ type SentenceRange = {
|
|
|
14
14
|
audiofile: string;
|
|
15
15
|
};
|
|
16
16
|
declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
|
|
17
|
-
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
|
|
17
|
+
declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
|
|
18
18
|
sentenceRanges: SentenceRange[];
|
|
19
19
|
transcriptionOffset: number;
|
|
20
20
|
}>;
|
|
@@ -2,6 +2,7 @@ import "../chunk-BIEQXUOY.js";
|
|
|
2
2
|
import { segmentText } from "@echogarden/text-segmentation";
|
|
3
3
|
import { getTrackDuration } from "../common/ffmpeg.js";
|
|
4
4
|
import { findNearestMatch } from "./fuzzy.js";
|
|
5
|
+
import { slugify } from "./slugify.js";
|
|
5
6
|
async function getSentencesWithOffsets(text) {
|
|
6
7
|
const sentences = await segmentText(text).then(
|
|
7
8
|
(r) => r.sentences.map((s) => s.text)
|
|
@@ -49,7 +50,7 @@ function getWindowIndexFromOffset(window, offset) {
|
|
|
49
50
|
/**
 * Collapses every run of whitespace in `input` into a single space.
 * Leading and trailing runs are collapsed too (not trimmed).
 */
function collapseWhitespace(input) {
  // Splitting on whitespace runs and re-joining with one space yields the
  // same string as replacing each run with " ".
  const pieces = input.split(/\s+/);
  return pieces.join(" ");
}
|
|
52
|
-
async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
|
|
53
|
+
async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
|
|
53
54
|
const sentenceRanges = [];
|
|
54
55
|
const fullTranscriptionText = transcription.transcript;
|
|
55
56
|
const transcriptionText = fullTranscriptionText.slice(chapterOffset);
|
|
@@ -57,13 +58,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
57
58
|
transcriptionText
|
|
58
59
|
).then((s) => s.map((sentence) => sentence.toLowerCase()));
|
|
59
60
|
let startSentenceEntry = startSentence;
|
|
60
|
-
const sentenceEntries =
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
const sentenceEntries = [];
|
|
62
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
63
|
+
const sentence = (await slugify(sentences[i], locale)).result;
|
|
64
|
+
if (sentence.length <= 3) {
|
|
65
|
+
if (i < startSentence) startSentenceEntry--;
|
|
66
|
+
continue;
|
|
64
67
|
}
|
|
65
|
-
|
|
66
|
-
}
|
|
68
|
+
sentenceEntries.push([i, sentence]);
|
|
69
|
+
}
|
|
67
70
|
let transcriptionWindowIndex = 0;
|
|
68
71
|
let transcriptionWindowOffset = 0;
|
|
69
72
|
let lastGoodTranscriptionWindow = 0;
|
|
@@ -76,7 +79,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
76
79
|
transcriptionWindowIndex,
|
|
77
80
|
transcriptionWindowIndex + 10
|
|
78
81
|
);
|
|
79
|
-
const transcriptionWindow =
|
|
82
|
+
const { result: transcriptionWindow, mapping } = await slugify(
|
|
83
|
+
transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
|
|
84
|
+
locale
|
|
85
|
+
);
|
|
86
|
+
const inverted = mapping.invert();
|
|
80
87
|
const query = collapseWhitespace(sentence.trim()).toLowerCase();
|
|
81
88
|
const firstMatch = findNearestMatch(
|
|
82
89
|
query,
|
|
@@ -99,8 +106,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
99
106
|
continue;
|
|
100
107
|
}
|
|
101
108
|
const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
|
|
109
|
+
const matchStart = inverted.map(firstMatch.index, 1);
|
|
110
|
+
const matchEnd = inverted.map(
|
|
111
|
+
firstMatch.index + firstMatch.match.length,
|
|
112
|
+
-1
|
|
113
|
+
);
|
|
102
114
|
const startResult = findStartTimestamp(
|
|
103
|
-
|
|
115
|
+
matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
|
|
104
116
|
transcription
|
|
105
117
|
);
|
|
106
118
|
if (!startResult) {
|
|
@@ -110,7 +122,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
110
122
|
let start = startResult.start;
|
|
111
123
|
const audiofile = startResult.audiofile;
|
|
112
124
|
const end = findEndTimestamp(
|
|
113
|
-
|
|
125
|
+
matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
|
|
114
126
|
transcription
|
|
115
127
|
) ?? startResult.end;
|
|
116
128
|
if (sentenceRanges.length > 0) {
|
|
@@ -151,10 +163,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
|
|
|
151
163
|
audiofile
|
|
152
164
|
});
|
|
153
165
|
notFound = 0;
|
|
154
|
-
lastMatchEnd =
|
|
166
|
+
lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
|
|
155
167
|
const windowIndexResult = getWindowIndexFromOffset(
|
|
156
168
|
transcriptionWindowList,
|
|
157
|
-
|
|
169
|
+
matchEnd + transcriptionWindowOffset
|
|
158
170
|
);
|
|
159
171
|
transcriptionWindowIndex += windowIndexResult.index;
|
|
160
172
|
transcriptionWindowOffset = windowIndexResult.offset;
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var slugify_exports = {};
|
|
20
|
+
__export(slugify_exports, {
|
|
21
|
+
slugify: () => slugify
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(slugify_exports);
|
|
24
|
+
var import_locale_currency = require("locale-currency");
|
|
25
|
+
var import_to_words = require("to-words");
|
|
26
|
+
var import_transliteration = require("@storyteller-platform/transliteration");
|
|
27
|
+
const replacerMap = /* @__PURE__ */ new WeakMap();
|
|
28
|
+
function createReplacers(locale) {
|
|
29
|
+
const maximizedLocale = locale.maximize();
|
|
30
|
+
const demoNumber = 123456.789;
|
|
31
|
+
const currencyFormat = new Intl.NumberFormat(locale, {
|
|
32
|
+
style: "currency",
|
|
33
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
34
|
+
currency: (0, import_locale_currency.getCurrency)(locale.maximize().region)
|
|
35
|
+
});
|
|
36
|
+
const currencyParts = currencyFormat.formatToParts(demoNumber);
|
|
37
|
+
const currencySymbols = currencyParts.reduce(
|
|
38
|
+
(acc, part, index) => {
|
|
39
|
+
if (part.type === "group") {
|
|
40
|
+
return {
|
|
41
|
+
...acc,
|
|
42
|
+
group: part.value
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
if (part.type === "decimal") {
|
|
46
|
+
return {
|
|
47
|
+
...acc,
|
|
48
|
+
decimal: part.value
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
if (part.type === "currency") {
|
|
52
|
+
return {
|
|
53
|
+
...acc,
|
|
54
|
+
currency: part.value,
|
|
55
|
+
currencyLeading: index === 0
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
return acc;
|
|
59
|
+
},
|
|
60
|
+
{ group: "", decimal: "", currency: "", currencyLeading: true }
|
|
61
|
+
);
|
|
62
|
+
const numeralRegexPart = `(\\p{Number}[\\p{Number}${currencySymbols.group}]*(?:[${currencySymbols.decimal}]\\p{Number}*)?)`;
|
|
63
|
+
const currencyRegex = currencySymbols.currencyLeading ? new RegExp(`[${currencySymbols.currency}]\\s?${numeralRegexPart}`, "gu") : new RegExp(`${numeralRegexPart}\\s?[${currencySymbols.currency}]`, "gu");
|
|
64
|
+
function currencyReplacer(match) {
|
|
65
|
+
const numeralMatch = match[1];
|
|
66
|
+
if (!numeralMatch) return match[0];
|
|
67
|
+
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
|
|
68
|
+
const number = parseFloat(normalizedNumeral);
|
|
69
|
+
return (0, import_to_words.toWords)(number, {
|
|
70
|
+
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
|
|
71
|
+
currency: true,
|
|
72
|
+
doNotAddOnly: true
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
const numberFormat = new Intl.NumberFormat(locale);
|
|
76
|
+
const numberParts = numberFormat.formatToParts(demoNumber);
|
|
77
|
+
const numberSymbols = numberParts.reduce(
|
|
78
|
+
(acc, part) => {
|
|
79
|
+
if (part.type === "group") {
|
|
80
|
+
return {
|
|
81
|
+
...acc,
|
|
82
|
+
group: part.value
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
if (part.type === "decimal") {
|
|
86
|
+
return {
|
|
87
|
+
...acc,
|
|
88
|
+
decimal: part.value
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
return acc;
|
|
92
|
+
},
|
|
93
|
+
{ group: "", decimal: "" }
|
|
94
|
+
);
|
|
95
|
+
const numberRegex = new RegExp(
|
|
96
|
+
`(\\p{Number}[\\p{Number}${numberSymbols.group}]*(?:[${numberSymbols.decimal}]\\p{Number}*)?)`,
|
|
97
|
+
"gu"
|
|
98
|
+
);
|
|
99
|
+
function numberReplacer(match) {
|
|
100
|
+
const numeralMatch = match[1];
|
|
101
|
+
if (!numeralMatch) return match[0];
|
|
102
|
+
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${numberSymbols.group}`, "g"), "").replace(new RegExp(`\\${numberSymbols.decimal}`), ".");
|
|
103
|
+
const number = parseFloat(normalizedNumeral);
|
|
104
|
+
return (0, import_to_words.toWords)(number, {
|
|
105
|
+
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
return [
|
|
109
|
+
[currencyRegex, currencyReplacer],
|
|
110
|
+
[numberRegex, numberReplacer]
|
|
111
|
+
];
|
|
112
|
+
}
|
|
113
|
+
/**
 * Slugifies `text` for `locale`, spelling out currency amounts and numbers
 * before transliteration and keeping only the characters a-z, A-Z and 0-9.
 *
 * Replacers are built once per locale object and cached in `replacerMap`
 * (a WeakMap, so the cache is keyed by object identity, not by locale tag).
 *
 * @param text Text to slugify.
 * @param locale `Intl.Locale` used to build the number/currency replacers.
 * @returns The slugified `result` and the offset `mapping` reported by the
 *   transliteration library.
 */
async function slugify(text, locale) {
  let localeReplacers = replacerMap.get(locale);
  if (localeReplacers === null || localeReplacers === undefined) {
    localeReplacers = createReplacers(locale);
  }
  replacerMap.set(locale, localeReplacers);

  const slugified = await (0, import_transliteration.slugify)(text, {
    allowedChars: "a-zA-Z0-9",
    replace: localeReplacers
  });
  return { result: slugified.result, mapping: slugified.mapping };
}
|
|
122
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
123
|
+
0 && (module.exports = {
|
|
124
|
+
slugify
|
|
125
|
+
});
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import * as _storyteller_platform_transliteration from '@storyteller-platform/transliteration';
|
|
2
|
+
|
|
3
|
+
/**
 * Slugifies `text` using number and currency replacers built for `locale`.
 *
 * @param text - Text to slugify.
 * @param locale - Locale the replacers are built (and cached) for.
 * @returns `result`, the slugified text, and `mapping`, the transliteration
 *   library's `Mapping` for the conversion (presumably relating offsets in
 *   `text` to offsets in `result` — confirm against that library's docs).
 */
declare function slugify(text: string, locale: Intl.Locale): Promise<{
|
|
4
|
+
result: string;
|
|
5
|
+
mapping: _storyteller_platform_transliteration.Mapping;
|
|
6
|
+
}>;
|
|
7
|
+
|
|
8
|
+
export { slugify };
|