@storyteller-platform/align 0.1.25 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/align.cjs +21 -9
- package/dist/align/align.js +22 -11
- package/dist/align/getSentenceRanges.cjs +0 -58
- package/dist/align/getSentenceRanges.d.cts +1 -2
- package/dist/align/getSentenceRanges.d.ts +1 -2
- package/dist/align/getSentenceRanges.js +0 -57
- package/dist/align/interpolateSentenceRanges.cjs +124 -0
- package/dist/align/interpolateSentenceRanges.d.cts +23 -0
- package/dist/align/interpolateSentenceRanges.d.ts +23 -0
- package/dist/align/interpolateSentenceRanges.js +101 -0
- package/dist/align/search.cjs +18 -7
- package/dist/align/search.js +18 -7
- package/dist/index.d.cts +1 -2
- package/dist/index.d.ts +1 -2
- package/dist/markup/markup.cjs +21 -14
- package/dist/markup/markup.d.cts +2 -4
- package/dist/markup/markup.d.ts +2 -4
- package/dist/markup/markup.js +28 -16
- package/dist/markup/model.cjs +138 -5
- package/dist/markup/model.d.cts +2 -57
- package/dist/markup/model.d.ts +2 -57
- package/dist/markup/model.js +136 -5
- package/dist/markup/parseDom.cjs +80 -25
- package/dist/markup/parseDom.d.cts +4 -4
- package/dist/markup/parseDom.d.ts +4 -4
- package/dist/markup/parseDom.js +87 -24
- package/dist/markup/resolvedPos.cjs +85 -0
- package/dist/markup/resolvedPos.d.cts +2 -0
- package/dist/markup/resolvedPos.d.ts +2 -0
- package/dist/markup/resolvedPos.js +62 -0
- package/dist/markup/segmentation.cjs +4 -8
- package/dist/markup/segmentation.d.cts +3 -8
- package/dist/markup/segmentation.d.ts +3 -8
- package/dist/markup/segmentation.js +3 -7
- package/dist/markup/serializeDom.d.cts +1 -1
- package/dist/markup/serializeDom.d.ts +1 -1
- package/dist/markup/transform.cjs +59 -2
- package/dist/markup/transform.d.cts +8 -2
- package/dist/markup/transform.d.ts +8 -2
- package/dist/markup/transform.js +58 -1
- package/dist/model-Bv3yPEdd.d.cts +96 -0
- package/dist/model-Bv3yPEdd.d.ts +96 -0
- package/dist/snapshot/snapshot.cjs +8 -6
- package/dist/snapshot/snapshot.js +9 -7
- package/package.json +1 -1
package/dist/align/align.cjs
CHANGED
|
@@ -87,8 +87,11 @@ var import_audiobook = require("@storyteller-platform/audiobook");
|
|
|
87
87
|
var import_epub = require("@storyteller-platform/epub");
|
|
88
88
|
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
89
89
|
var import_ffmpeg = require("../common/ffmpeg.cjs");
|
|
90
|
+
var import_parseDom = require("../markup/parseDom.cjs");
|
|
90
91
|
var import_segmentation = require("../markup/segmentation.cjs");
|
|
92
|
+
var import_transform = require("../markup/transform.cjs");
|
|
91
93
|
var import_getSentenceRanges = require("./getSentenceRanges.cjs");
|
|
94
|
+
var import_interpolateSentenceRanges = require("./interpolateSentenceRanges.cjs");
|
|
92
95
|
var import_search = require("./search.cjs");
|
|
93
96
|
var import_slugify = require("./slugify.cjs");
|
|
94
97
|
var import_textFragments = require("./textFragments.cjs");
|
|
@@ -172,12 +175,12 @@ class Aligner {
|
|
|
172
175
|
};
|
|
173
176
|
async getChapterSentences(chapterId) {
|
|
174
177
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
);
|
|
178
|
+
const original = (0, import_parseDom.parseDom)(import_epub.Epub.getXhtmlBody(chapterXml));
|
|
179
|
+
const inlined = (0, import_transform.inlineFootnotes)(original);
|
|
180
|
+
const lifted = (0, import_transform.liftText)(inlined.root);
|
|
181
|
+
const segmentation = await (0, import_segmentation.segmentChapter)(lifted.result, {
|
|
182
|
+
primaryLocale: this.languageOverride ?? await this.epub.getLanguage()
|
|
183
|
+
});
|
|
181
184
|
return segmentation.filter((s) => s.text.match(/\S/));
|
|
182
185
|
}
|
|
183
186
|
async writeAlignedChapter(alignedChapter) {
|
|
@@ -505,16 +508,25 @@ class Aligner {
|
|
|
505
508
|
});
|
|
506
509
|
const sentenceRanges = [];
|
|
507
510
|
const chapterSentenceCounts = {};
|
|
511
|
+
const audioFileDurations = {};
|
|
508
512
|
for (const alignedChapter of audioOrderedChapters) {
|
|
509
513
|
sentenceRanges.push(...alignedChapter.sentenceRanges);
|
|
514
|
+
for (const sentenceRange of sentenceRanges) {
|
|
515
|
+
if (!(sentenceRange.audiofile in audioFileDurations)) {
|
|
516
|
+
audioFileDurations[sentenceRange.audiofile] = await (0, import_ffmpeg.getTrackDuration)(
|
|
517
|
+
sentenceRange.audiofile
|
|
518
|
+
);
|
|
519
|
+
}
|
|
520
|
+
}
|
|
510
521
|
const sentences = await this.getChapterSentences(
|
|
511
522
|
alignedChapter.chapter.id
|
|
512
523
|
);
|
|
513
524
|
chapterSentenceCounts[alignedChapter.chapter.id] = sentences.length;
|
|
514
525
|
}
|
|
515
|
-
const interpolated =
|
|
526
|
+
const interpolated = (0, import_interpolateSentenceRanges.interpolateSentenceRanges)(
|
|
516
527
|
sentenceRanges,
|
|
517
|
-
chapterSentenceCounts
|
|
528
|
+
chapterSentenceCounts,
|
|
529
|
+
audioFileDurations
|
|
518
530
|
);
|
|
519
531
|
const expanded = (0, import_getSentenceRanges.expandEmptySentenceRanges)(interpolated);
|
|
520
532
|
const collapsed = await (0, import_getSentenceRanges.collapseSentenceRangeGaps)(expanded);
|
|
@@ -525,7 +537,7 @@ class Aligner {
|
|
|
525
537
|
);
|
|
526
538
|
const finalSentenceRanges = collapsed.slice(
|
|
527
539
|
collapsedStart,
|
|
528
|
-
collapsedStart + sentences.length
|
|
540
|
+
collapsedStart + sentences.length
|
|
529
541
|
);
|
|
530
542
|
alignedChapter.sentenceRanges = finalSentenceRanges;
|
|
531
543
|
for (const [i, wordRanges] of (0, import_itertools.enumerate)(alignedChapter.wordRanges)) {
|
package/dist/align/align.js
CHANGED
|
@@ -16,15 +16,17 @@ import {
|
|
|
16
16
|
createTiming
|
|
17
17
|
} from "@storyteller-platform/ghost-story";
|
|
18
18
|
import { getTrackDuration } from "../common/ffmpeg.js";
|
|
19
|
-
import {
|
|
19
|
+
import { parseDom } from "../markup/parseDom.js";
|
|
20
|
+
import { segmentChapter } from "../markup/segmentation.js";
|
|
21
|
+
import { inlineFootnotes, liftText } from "../markup/transform.js";
|
|
20
22
|
import {
|
|
21
23
|
collapseSentenceRangeGaps,
|
|
22
24
|
expandEmptySentenceRanges,
|
|
23
25
|
getChapterDuration,
|
|
24
26
|
getSentenceRanges,
|
|
25
|
-
interpolateSentenceRanges,
|
|
26
27
|
mapTranscriptionTimeline
|
|
27
28
|
} from "./getSentenceRanges.js";
|
|
29
|
+
import { interpolateSentenceRanges } from "./interpolateSentenceRanges.js";
|
|
28
30
|
import { findBoundaries } from "./search.js";
|
|
29
31
|
import { slugify } from "./slugify.js";
|
|
30
32
|
import { TextFragmentTrie } from "./textFragments.js";
|
|
@@ -108,12 +110,12 @@ class Aligner {
|
|
|
108
110
|
};
|
|
109
111
|
async getChapterSentences(chapterId) {
|
|
110
112
|
const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
|
|
111
|
-
const
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
);
|
|
113
|
+
const original = parseDom(Epub.getXhtmlBody(chapterXml));
|
|
114
|
+
const inlined = inlineFootnotes(original);
|
|
115
|
+
const lifted = liftText(inlined.root);
|
|
116
|
+
const segmentation = await segmentChapter(lifted.result, {
|
|
117
|
+
primaryLocale: this.languageOverride ?? await this.epub.getLanguage()
|
|
118
|
+
});
|
|
117
119
|
return segmentation.filter((s) => s.text.match(/\S/));
|
|
118
120
|
}
|
|
119
121
|
async writeAlignedChapter(alignedChapter) {
|
|
@@ -441,16 +443,25 @@ class Aligner {
|
|
|
441
443
|
});
|
|
442
444
|
const sentenceRanges = [];
|
|
443
445
|
const chapterSentenceCounts = {};
|
|
446
|
+
const audioFileDurations = {};
|
|
444
447
|
for (const alignedChapter of audioOrderedChapters) {
|
|
445
448
|
sentenceRanges.push(...alignedChapter.sentenceRanges);
|
|
449
|
+
for (const sentenceRange of sentenceRanges) {
|
|
450
|
+
if (!(sentenceRange.audiofile in audioFileDurations)) {
|
|
451
|
+
audioFileDurations[sentenceRange.audiofile] = await getTrackDuration(
|
|
452
|
+
sentenceRange.audiofile
|
|
453
|
+
);
|
|
454
|
+
}
|
|
455
|
+
}
|
|
446
456
|
const sentences = await this.getChapterSentences(
|
|
447
457
|
alignedChapter.chapter.id
|
|
448
458
|
);
|
|
449
459
|
chapterSentenceCounts[alignedChapter.chapter.id] = sentences.length;
|
|
450
460
|
}
|
|
451
|
-
const interpolated =
|
|
461
|
+
const interpolated = interpolateSentenceRanges(
|
|
452
462
|
sentenceRanges,
|
|
453
|
-
chapterSentenceCounts
|
|
463
|
+
chapterSentenceCounts,
|
|
464
|
+
audioFileDurations
|
|
454
465
|
);
|
|
455
466
|
const expanded = expandEmptySentenceRanges(interpolated);
|
|
456
467
|
const collapsed = await collapseSentenceRangeGaps(expanded);
|
|
@@ -461,7 +472,7 @@ class Aligner {
|
|
|
461
472
|
);
|
|
462
473
|
const finalSentenceRanges = collapsed.slice(
|
|
463
474
|
collapsedStart,
|
|
464
|
-
collapsedStart + sentences.length
|
|
475
|
+
collapsedStart + sentences.length
|
|
465
476
|
);
|
|
466
477
|
alignedChapter.sentenceRanges = finalSentenceRanges;
|
|
467
478
|
for (const [i, wordRanges] of enumerate(alignedChapter.wordRanges)) {
|
|
@@ -23,7 +23,6 @@ __export(getSentenceRanges_exports, {
|
|
|
23
23
|
findEndTimestamp: () => findEndTimestamp,
|
|
24
24
|
getChapterDuration: () => getChapterDuration,
|
|
25
25
|
getSentenceRanges: () => getSentenceRanges,
|
|
26
|
-
interpolateSentenceRanges: () => interpolateSentenceRanges,
|
|
27
26
|
mapTranscriptionTimeline: () => mapTranscriptionTimeline
|
|
28
27
|
});
|
|
29
28
|
module.exports = __toCommonJS(getSentenceRanges_exports);
|
|
@@ -311,62 +310,6 @@ async function getSentenceRanges(transcriptionText, mappedTimeline, sentences, c
|
|
|
311
310
|
lastFoundSentence
|
|
312
311
|
};
|
|
313
312
|
}
|
|
314
|
-
async function getLargestGap(trailing, leading) {
|
|
315
|
-
const leadingGap = leading.start;
|
|
316
|
-
const trailingGap = await (0, import_ffmpeg.getTrackDuration)(trailing.audiofile) - trailing.end;
|
|
317
|
-
if (trailingGap > leadingGap) return [trailingGap, trailing.audiofile];
|
|
318
|
-
return [leadingGap, leading.audiofile];
|
|
319
|
-
}
|
|
320
|
-
async function interpolateSentenceRanges(sentenceRanges, chapterSentenceCounts) {
|
|
321
|
-
const interpolated = [];
|
|
322
|
-
for (let i = 0; i < sentenceRanges.length; i++) {
|
|
323
|
-
const endRange = sentenceRanges[i];
|
|
324
|
-
const startRange = sentenceRanges[i - 1] ?? {
|
|
325
|
-
id: 0,
|
|
326
|
-
audiofile: endRange.audiofile,
|
|
327
|
-
chapterId: endRange.chapterId,
|
|
328
|
-
start: 0,
|
|
329
|
-
end: 0
|
|
330
|
-
};
|
|
331
|
-
const newChapter = startRange.chapterId !== endRange.chapterId;
|
|
332
|
-
const newAudiofile = startRange.audiofile !== endRange.audiofile;
|
|
333
|
-
const count = newChapter ? chapterSentenceCounts[startRange.chapterId] - startRange.id - 1 : endRange.id - startRange.id - 1;
|
|
334
|
-
if (count === 0) {
|
|
335
|
-
interpolated.push(endRange);
|
|
336
|
-
continue;
|
|
337
|
-
}
|
|
338
|
-
let [diff, audiofile] = newAudiofile ? await getLargestGap(startRange, endRange) : [endRange.start - startRange.end, endRange.audiofile];
|
|
339
|
-
if (diff <= 0) {
|
|
340
|
-
if (newAudiofile) {
|
|
341
|
-
const rangeLength = endRange.end - endRange.start;
|
|
342
|
-
diff = rangeLength < 0.5 ? rangeLength / 2 : 0.25;
|
|
343
|
-
endRange.start = diff;
|
|
344
|
-
} else {
|
|
345
|
-
diff = 0.25;
|
|
346
|
-
startRange.end = startRange.start - diff;
|
|
347
|
-
}
|
|
348
|
-
}
|
|
349
|
-
const interpolatedLength = diff / count;
|
|
350
|
-
const start = newAudiofile ? 0 : startRange.end;
|
|
351
|
-
for (let i2 = 0; i2 < count; i2++) {
|
|
352
|
-
let id = startRange.id + i2 + 1;
|
|
353
|
-
let chapterId = startRange.chapterId;
|
|
354
|
-
if (newChapter && i2 > chapterSentenceCounts[startRange.chapterId] - startRange.id) {
|
|
355
|
-
id = i2;
|
|
356
|
-
chapterId = endRange.chapterId;
|
|
357
|
-
}
|
|
358
|
-
interpolated.push({
|
|
359
|
-
id,
|
|
360
|
-
chapterId,
|
|
361
|
-
start: start + interpolatedLength * i2,
|
|
362
|
-
end: start + interpolatedLength * (i2 + 1),
|
|
363
|
-
audiofile
|
|
364
|
-
});
|
|
365
|
-
}
|
|
366
|
-
interpolated.push(endRange);
|
|
367
|
-
}
|
|
368
|
-
return interpolated;
|
|
369
|
-
}
|
|
370
313
|
function expandEmptySentenceRanges(sentenceRanges) {
|
|
371
314
|
const expandedRanges = [];
|
|
372
315
|
for (const sentenceRange of sentenceRanges) {
|
|
@@ -418,6 +361,5 @@ function getChapterDuration(sentenceRanges) {
|
|
|
418
361
|
findEndTimestamp,
|
|
419
362
|
getChapterDuration,
|
|
420
363
|
getSentenceRanges,
|
|
421
|
-
interpolateSentenceRanges,
|
|
422
364
|
mapTranscriptionTimeline
|
|
423
365
|
});
|
|
@@ -54,7 +54,6 @@ declare function getSentenceRanges(transcriptionText: string, mappedTimeline: Ma
|
|
|
54
54
|
firstFoundSentence: number;
|
|
55
55
|
lastFoundSentence: number;
|
|
56
56
|
}>;
|
|
57
|
-
declare function interpolateSentenceRanges(sentenceRanges: SentenceRange[], chapterSentenceCounts: Record<string, number>): Promise<SentenceRange[]>;
|
|
58
57
|
/**
|
|
59
58
|
* Whisper sometimes provides words with no time information,
|
|
60
59
|
* or start and end timestamps that are equal. EpubCheck complains
|
|
@@ -65,4 +64,4 @@ declare function expandEmptySentenceRanges<Range extends SentenceRange | WordRan
|
|
|
65
64
|
declare function collapseSentenceRangeGaps(sentenceRanges: SentenceRange[]): Promise<SentenceRange[]>;
|
|
66
65
|
declare function getChapterDuration(sentenceRanges: SentenceRange[]): number;
|
|
67
66
|
|
|
68
|
-
export { type MappedTimeline, type SentenceRange, type StorytellerTimelineEntry, type StorytellerTranscription, type WordRange, collapseSentenceRangeGaps, expandEmptySentenceRanges, findEndTimestamp, getChapterDuration, getSentenceRanges,
|
|
67
|
+
export { type MappedTimeline, type SentenceRange, type StorytellerTimelineEntry, type StorytellerTranscription, type WordRange, collapseSentenceRangeGaps, expandEmptySentenceRanges, findEndTimestamp, getChapterDuration, getSentenceRanges, mapTranscriptionTimeline };
|
|
@@ -54,7 +54,6 @@ declare function getSentenceRanges(transcriptionText: string, mappedTimeline: Ma
|
|
|
54
54
|
firstFoundSentence: number;
|
|
55
55
|
lastFoundSentence: number;
|
|
56
56
|
}>;
|
|
57
|
-
declare function interpolateSentenceRanges(sentenceRanges: SentenceRange[], chapterSentenceCounts: Record<string, number>): Promise<SentenceRange[]>;
|
|
58
57
|
/**
|
|
59
58
|
* Whisper sometimes provides words with no time information,
|
|
60
59
|
* or start and end timestamps that are equal. EpubCheck complains
|
|
@@ -65,4 +64,4 @@ declare function expandEmptySentenceRanges<Range extends SentenceRange | WordRan
|
|
|
65
64
|
declare function collapseSentenceRangeGaps(sentenceRanges: SentenceRange[]): Promise<SentenceRange[]>;
|
|
66
65
|
declare function getChapterDuration(sentenceRanges: SentenceRange[]): number;
|
|
67
66
|
|
|
68
|
-
export { type MappedTimeline, type SentenceRange, type StorytellerTimelineEntry, type StorytellerTranscription, type WordRange, collapseSentenceRangeGaps, expandEmptySentenceRanges, findEndTimestamp, getChapterDuration, getSentenceRanges,
|
|
67
|
+
export { type MappedTimeline, type SentenceRange, type StorytellerTimelineEntry, type StorytellerTranscription, type WordRange, collapseSentenceRangeGaps, expandEmptySentenceRanges, findEndTimestamp, getChapterDuration, getSentenceRanges, mapTranscriptionTimeline };
|
|
@@ -283,62 +283,6 @@ async function getSentenceRanges(transcriptionText, mappedTimeline, sentences, c
|
|
|
283
283
|
lastFoundSentence
|
|
284
284
|
};
|
|
285
285
|
}
|
|
286
|
-
async function getLargestGap(trailing, leading) {
|
|
287
|
-
const leadingGap = leading.start;
|
|
288
|
-
const trailingGap = await getTrackDuration(trailing.audiofile) - trailing.end;
|
|
289
|
-
if (trailingGap > leadingGap) return [trailingGap, trailing.audiofile];
|
|
290
|
-
return [leadingGap, leading.audiofile];
|
|
291
|
-
}
|
|
292
|
-
async function interpolateSentenceRanges(sentenceRanges, chapterSentenceCounts) {
|
|
293
|
-
const interpolated = [];
|
|
294
|
-
for (let i = 0; i < sentenceRanges.length; i++) {
|
|
295
|
-
const endRange = sentenceRanges[i];
|
|
296
|
-
const startRange = sentenceRanges[i - 1] ?? {
|
|
297
|
-
id: 0,
|
|
298
|
-
audiofile: endRange.audiofile,
|
|
299
|
-
chapterId: endRange.chapterId,
|
|
300
|
-
start: 0,
|
|
301
|
-
end: 0
|
|
302
|
-
};
|
|
303
|
-
const newChapter = startRange.chapterId !== endRange.chapterId;
|
|
304
|
-
const newAudiofile = startRange.audiofile !== endRange.audiofile;
|
|
305
|
-
const count = newChapter ? chapterSentenceCounts[startRange.chapterId] - startRange.id - 1 : endRange.id - startRange.id - 1;
|
|
306
|
-
if (count === 0) {
|
|
307
|
-
interpolated.push(endRange);
|
|
308
|
-
continue;
|
|
309
|
-
}
|
|
310
|
-
let [diff, audiofile] = newAudiofile ? await getLargestGap(startRange, endRange) : [endRange.start - startRange.end, endRange.audiofile];
|
|
311
|
-
if (diff <= 0) {
|
|
312
|
-
if (newAudiofile) {
|
|
313
|
-
const rangeLength = endRange.end - endRange.start;
|
|
314
|
-
diff = rangeLength < 0.5 ? rangeLength / 2 : 0.25;
|
|
315
|
-
endRange.start = diff;
|
|
316
|
-
} else {
|
|
317
|
-
diff = 0.25;
|
|
318
|
-
startRange.end = startRange.start - diff;
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
const interpolatedLength = diff / count;
|
|
322
|
-
const start = newAudiofile ? 0 : startRange.end;
|
|
323
|
-
for (let i2 = 0; i2 < count; i2++) {
|
|
324
|
-
let id = startRange.id + i2 + 1;
|
|
325
|
-
let chapterId = startRange.chapterId;
|
|
326
|
-
if (newChapter && i2 > chapterSentenceCounts[startRange.chapterId] - startRange.id) {
|
|
327
|
-
id = i2;
|
|
328
|
-
chapterId = endRange.chapterId;
|
|
329
|
-
}
|
|
330
|
-
interpolated.push({
|
|
331
|
-
id,
|
|
332
|
-
chapterId,
|
|
333
|
-
start: start + interpolatedLength * i2,
|
|
334
|
-
end: start + interpolatedLength * (i2 + 1),
|
|
335
|
-
audiofile
|
|
336
|
-
});
|
|
337
|
-
}
|
|
338
|
-
interpolated.push(endRange);
|
|
339
|
-
}
|
|
340
|
-
return interpolated;
|
|
341
|
-
}
|
|
342
286
|
function expandEmptySentenceRanges(sentenceRanges) {
|
|
343
287
|
const expandedRanges = [];
|
|
344
288
|
for (const sentenceRange of sentenceRanges) {
|
|
@@ -389,6 +333,5 @@ export {
|
|
|
389
333
|
findEndTimestamp,
|
|
390
334
|
getChapterDuration,
|
|
391
335
|
getSentenceRanges,
|
|
392
|
-
interpolateSentenceRanges,
|
|
393
336
|
mapTranscriptionTimeline
|
|
394
337
|
};
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var interpolateSentenceRanges_exports = {};
|
|
20
|
+
__export(interpolateSentenceRanges_exports, {
|
|
21
|
+
interpolateSentenceRanges: () => interpolateSentenceRanges
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(interpolateSentenceRanges_exports);
|
|
24
|
+
/**
 * Assigns a time range to every slot in a gap between two anchors.
 *
 * When both anchors are in the same audio file, the slots evenly divide the
 * time between `left.time` and `right.time`. When the anchors are in
 * different files, the slots are split between the tail of the left file
 * (from `left.time` to that file's duration) and the head of the right file
 * (from 0 to `right.time`), proportionally to the time available in each.
 *
 * Available time is clamped at zero, so overlapping anchors or a reported
 * duration shorter than `left.time` yield zero-length ranges instead of
 * inverted ones (`end < start`).
 *
 * @param slots Objects to place; each is copied into the result and extended
 *   with `audiofile`, `start` and `end`.
 * @param left Anchor `{ time, audiofile }` the gap begins at.
 * @param right Anchor `{ time, audiofile }` the gap ends at.
 * @param audioFileDurations Map from audio file to its duration; a missing
 *   entry for the left file is treated as "no time left in that file".
 * @returns One range per slot, in slot order.
 */
function buildGapRanges(slots, left, right, audioFileDurations) {
  const n = slots.length;
  if (n === 0) return [];

  if (left.audiofile === right.audiofile) {
    // Never let the span go negative: that would emit ranges running backwards.
    const span = Math.max(0, right.time - left.time);
    return slots.map((slot, i) => ({
      ...slot,
      audiofile: left.audiofile,
      start: left.time + span * i / n,
      end: left.time + span * (i + 1) / n
    }));
  }

  const leftDuration = audioFileDurations[left.audiofile] ?? left.time;
  const leftAvail = Math.max(0, leftDuration - left.time);
  const rightAvail = Math.max(0, right.time);
  const total = leftAvail + rightAvail;
  // With both amounts non-negative the ratio is within [0, 1], so the number
  // of slots given to the left file is always within [0, n].
  const n1 = total > 0 ? Math.round(n * (leftAvail / total)) : n;
  const n2 = n - n1;

  const result = [];
  for (let i = 0; i < n1; i++) {
    result.push({
      ...slots[i],
      audiofile: left.audiofile,
      start: left.time + leftAvail * i / n1,
      end: left.time + leftAvail * (i + 1) / n1
    });
  }
  for (let i = 0; i < n2; i++) {
    result.push({
      ...slots[n1 + i],
      audiofile: right.audiofile,
      start: rightAvail * i / n2,
      end: rightAvail * (i + 1) / n2
    });
  }
  return result;
}
|
|
69
|
+
/**
 * Given the sentence ranges of a whole book, ordered by occurrence in the
 * audio, inserts interpolated ranges for every sentence that has none.
 *
 * Gaps are filled in three places:
 *  - before the first range, when its id is greater than 0;
 *  - between consecutive ranges, either inside one chapter (missing ids) or
 *    across a chapter change (rest of the previous chapter plus the start of
 *    the next one);
 *  - after the last range, up to that chapter's sentence count.
 *
 * @param sentenceRanges Known ranges; the objects are reused in the result.
 * @param chapterSentenceCounts Map from chapter id to its sentence count.
 *   A missing entry is treated as "the chapter ends at the known range".
 * @param audioFileDurations Map from audio file to its duration.
 * @returns A new array containing the known and the interpolated ranges.
 */
function interpolateSentenceRanges(sentenceRanges, chapterSentenceCounts, audioFileDurations) {
  const filled = [];
  if (sentenceRanges.length === 0) return filled;

  // Appends `{ chapterId, id }` placeholders for ids in [fromId, toId).
  const appendSlots = (target, chapterId, fromId, toId) => {
    for (let id = fromId; id < toId; id++) {
      target.push({ chapterId, id });
    }
  };

  // Leading gap: sentences of the first chapter that precede the first range.
  const head = sentenceRanges[0];
  if (head.id > 0) {
    const leading = [];
    appendSlots(leading, head.chapterId, 0, head.id);
    filled.push(
      ...buildGapRanges(
        leading,
        { time: 0, audiofile: head.audiofile },
        { time: head.start, audiofile: head.audiofile },
        audioFileDurations
      )
    );
  }
  filled.push(head);

  // Gaps between each pair of neighbouring ranges.
  for (let idx = 1; idx < sentenceRanges.length; idx++) {
    const before = sentenceRanges[idx - 1];
    const after = sentenceRanges[idx];
    const missing = [];

    if (before.chapterId !== after.chapterId) {
      // Finish the previous chapter, then start the next one from id 0.
      const beforeTotal = chapterSentenceCounts[before.chapterId] ?? before.id + 1;
      appendSlots(missing, before.chapterId, before.id + 1, beforeTotal);
      appendSlots(missing, after.chapterId, 0, after.id);
    } else {
      appendSlots(missing, before.chapterId, before.id + 1, after.id);
    }

    if (missing.length > 0) {
      filled.push(
        ...buildGapRanges(
          missing,
          { time: before.end, audiofile: before.audiofile },
          { time: after.start, audiofile: after.audiofile },
          audioFileDurations
        )
      );
    }
    filled.push(after);
  }

  // Trailing gap: sentences of the last chapter that follow the last range.
  const tail = sentenceRanges[sentenceRanges.length - 1];
  const tailTotal = chapterSentenceCounts[tail.chapterId] ?? tail.id + 1;
  if (tail.id < tailTotal - 1) {
    const trailing = [];
    appendSlots(trailing, tail.chapterId, tail.id + 1, tailTotal);
    const fileEnd = audioFileDurations[tail.audiofile] ?? tail.end;
    filled.push(
      ...buildGapRanges(
        trailing,
        { time: tail.end, audiofile: tail.audiofile },
        { time: fileEnd, audiofile: tail.audiofile },
        audioFileDurations
      )
    );
  }

  return filled;
}
|
|
121
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
122
|
+
0 && (module.exports = {
|
|
123
|
+
interpolateSentenceRanges
|
|
124
|
+
});
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { SentenceRange } from './getSentenceRanges.cjs';
|
|
2
|
+
import '@storyteller-platform/ghost-story';
|
|
3
|
+
import '@echogarden/text-segmentation';
|
|
4
|
+
import '@storyteller-platform/transliteration';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Given a sequence of sentence ranges from an entire book,
|
|
8
|
+
* ordered by occurrence in audio, interpolates sentence ranges
|
|
9
|
+
* to fill any gaps.
|
|
10
|
+
*
|
|
11
|
+
* A gap may be:
|
|
12
|
+
* - A non-linearity between two sequential sentence ranges
|
|
13
|
+
* in the same chapter, e.g. chapter001#0 -> chapter001#3
|
|
14
|
+
* - A chapter whose sentence ranges start at a number greater
|
|
15
|
+
* than 0, e.g. chapter001#330 -> chapter002#2
|
|
16
|
+
* - A chapter whose sentence ranges end at a number lower
|
|
17
|
+
* than the total number of sentences in that chapter,
|
|
18
|
+
* e.g. chapter001#325 -> chapter002#0, where
|
|
19
|
+
* chapterSentenceCounts["chapter001"] === 330
|
|
20
|
+
*/
|
|
21
|
+
declare function interpolateSentenceRanges(sentenceRanges: SentenceRange[], chapterSentenceCounts: Record<string, number>, audioFileDurations: Record<string, number>): SentenceRange[];
|
|
22
|
+
|
|
23
|
+
export { interpolateSentenceRanges };
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { SentenceRange } from './getSentenceRanges.js';
|
|
2
|
+
import '@storyteller-platform/ghost-story';
|
|
3
|
+
import '@echogarden/text-segmentation';
|
|
4
|
+
import '@storyteller-platform/transliteration';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Given a sequence of sentence ranges from an entire book,
|
|
8
|
+
* ordered by occurrence in audio, interpolates sentence ranges
|
|
9
|
+
* to fill any gaps.
|
|
10
|
+
*
|
|
11
|
+
* A gap may be:
|
|
12
|
+
* - A non-linearity between two sequential sentence ranges
|
|
13
|
+
* in the same chapter, e.g. chapter001#0 -> chapter001#3
|
|
14
|
+
* - A chapter whose sentence ranges start at a number greater
|
|
15
|
+
* than 0, e.g. chapter001#330 -> chapter002#2
|
|
16
|
+
* - A chapter whose sentence ranges end at a number lower
|
|
17
|
+
* than the total number of sentences in that chapter,
|
|
18
|
+
* e.g. chapter001#325 -> chapter002#0, where
|
|
19
|
+
* chapterSentenceCounts["chapter001"] === 330
|
|
20
|
+
*/
|
|
21
|
+
declare function interpolateSentenceRanges(sentenceRanges: SentenceRange[], chapterSentenceCounts: Record<string, number>, audioFileDurations: Record<string, number>): SentenceRange[];
|
|
22
|
+
|
|
23
|
+
export { interpolateSentenceRanges };
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import "../chunk-BIEQXUOY.js";
|
|
2
|
+
/**
 * Assigns a time range to every slot in a gap between two anchors.
 *
 * When both anchors are in the same audio file, the slots evenly divide the
 * time between `left.time` and `right.time`. When the anchors are in
 * different files, the slots are split between the tail of the left file
 * (from `left.time` to that file's duration) and the head of the right file
 * (from 0 to `right.time`), proportionally to the time available in each.
 *
 * Available time is clamped at zero, so overlapping anchors or a reported
 * duration shorter than `left.time` yield zero-length ranges instead of
 * inverted ones (`end < start`).
 *
 * @param slots Objects to place; each is copied into the result and extended
 *   with `audiofile`, `start` and `end`.
 * @param left Anchor `{ time, audiofile }` the gap begins at.
 * @param right Anchor `{ time, audiofile }` the gap ends at.
 * @param audioFileDurations Map from audio file to its duration; a missing
 *   entry for the left file is treated as "no time left in that file".
 * @returns One range per slot, in slot order.
 */
function buildGapRanges(slots, left, right, audioFileDurations) {
  const n = slots.length;
  if (n === 0) return [];

  if (left.audiofile === right.audiofile) {
    // Never let the span go negative: that would emit ranges running backwards.
    const span = Math.max(0, right.time - left.time);
    return slots.map((slot, i) => ({
      ...slot,
      audiofile: left.audiofile,
      start: left.time + span * i / n,
      end: left.time + span * (i + 1) / n
    }));
  }

  const leftDuration = audioFileDurations[left.audiofile] ?? left.time;
  const leftAvail = Math.max(0, leftDuration - left.time);
  const rightAvail = Math.max(0, right.time);
  const total = leftAvail + rightAvail;
  // With both amounts non-negative the ratio is within [0, 1], so the number
  // of slots given to the left file is always within [0, n].
  const n1 = total > 0 ? Math.round(n * (leftAvail / total)) : n;
  const n2 = n - n1;

  const result = [];
  for (let i = 0; i < n1; i++) {
    result.push({
      ...slots[i],
      audiofile: left.audiofile,
      start: left.time + leftAvail * i / n1,
      end: left.time + leftAvail * (i + 1) / n1
    });
  }
  for (let i = 0; i < n2; i++) {
    result.push({
      ...slots[n1 + i],
      audiofile: right.audiofile,
      start: rightAvail * i / n2,
      end: rightAvail * (i + 1) / n2
    });
  }
  return result;
}
|
|
47
|
+
/**
 * Given the sentence ranges of a whole book, ordered by occurrence in the
 * audio, inserts interpolated ranges for every sentence that has none.
 *
 * Gaps are filled in three places:
 *  - before the first range, when its id is greater than 0;
 *  - between consecutive ranges, either inside one chapter (missing ids) or
 *    across a chapter change (rest of the previous chapter plus the start of
 *    the next one);
 *  - after the last range, up to that chapter's sentence count.
 *
 * @param sentenceRanges Known ranges; the objects are reused in the result.
 * @param chapterSentenceCounts Map from chapter id to its sentence count.
 *   A missing entry is treated as "the chapter ends at the known range".
 * @param audioFileDurations Map from audio file to its duration.
 * @returns A new array containing the known and the interpolated ranges.
 */
function interpolateSentenceRanges(sentenceRanges, chapterSentenceCounts, audioFileDurations) {
  const filled = [];
  if (sentenceRanges.length === 0) return filled;

  // Appends `{ chapterId, id }` placeholders for ids in [fromId, toId).
  const appendSlots = (target, chapterId, fromId, toId) => {
    for (let id = fromId; id < toId; id++) {
      target.push({ chapterId, id });
    }
  };

  // Leading gap: sentences of the first chapter that precede the first range.
  const head = sentenceRanges[0];
  if (head.id > 0) {
    const leading = [];
    appendSlots(leading, head.chapterId, 0, head.id);
    filled.push(
      ...buildGapRanges(
        leading,
        { time: 0, audiofile: head.audiofile },
        { time: head.start, audiofile: head.audiofile },
        audioFileDurations
      )
    );
  }
  filled.push(head);

  // Gaps between each pair of neighbouring ranges.
  for (let idx = 1; idx < sentenceRanges.length; idx++) {
    const before = sentenceRanges[idx - 1];
    const after = sentenceRanges[idx];
    const missing = [];

    if (before.chapterId !== after.chapterId) {
      // Finish the previous chapter, then start the next one from id 0.
      const beforeTotal = chapterSentenceCounts[before.chapterId] ?? before.id + 1;
      appendSlots(missing, before.chapterId, before.id + 1, beforeTotal);
      appendSlots(missing, after.chapterId, 0, after.id);
    } else {
      appendSlots(missing, before.chapterId, before.id + 1, after.id);
    }

    if (missing.length > 0) {
      filled.push(
        ...buildGapRanges(
          missing,
          { time: before.end, audiofile: before.audiofile },
          { time: after.start, audiofile: after.audiofile },
          audioFileDurations
        )
      );
    }
    filled.push(after);
  }

  // Trailing gap: sentences of the last chapter that follow the last range.
  const tail = sentenceRanges[sentenceRanges.length - 1];
  const tailTotal = chapterSentenceCounts[tail.chapterId] ?? tail.id + 1;
  if (tail.id < tailTotal - 1) {
    const trailing = [];
    appendSlots(trailing, tail.chapterId, tail.id + 1, tailTotal);
    const fileEnd = audioFileDurations[tail.audiofile] ?? tail.end;
    filled.push(
      ...buildGapRanges(
        trailing,
        { time: tail.end, audiofile: tail.audiofile },
        { time: fileEnd, audiofile: tail.audiofile },
        audioFileDurations
      )
    );
  }

  return filled;
}
|
|
99
|
+
export {
|
|
100
|
+
interpolateSentenceRanges
|
|
101
|
+
};
|
package/dist/align/search.cjs
CHANGED
|
@@ -37,16 +37,16 @@ function buildNgramIndex(text) {
|
|
|
37
37
|
}
|
|
38
38
|
return index;
|
|
39
39
|
}
|
|
40
|
+
/** Number of consecutive words that make up one n-gram. */
const NGRAM_SIZE = 5;

/**
 * Yields every window of NGRAM_SIZE consecutive words in a hyphen-separated
 * text, together with the word index at which the window starts.
 *
 * @param {string} text Words joined with "-".
 * @yields {[string, number]} The n-gram (words re-joined with "-") and the
 *   index of its first word.
 */
function* ngrams(text) {
  const words = text.split("-");
  // A text of W words contains W - NGRAM_SIZE + 1 full windows. The previous
  // bound (W - NGRAM_SIZE - 1) silently dropped the last two of them, so a
  // text of exactly NGRAM_SIZE words produced no n-gram at all.
  const windowCount = words.length - NGRAM_SIZE + 1;
  for (let i = 0; i < windowCount; i++) {
    yield [words.slice(i, i + NGRAM_SIZE).join("-"), i];
  }
}
|
|
49
48
|
function collectBoundaryVotes(query, document) {
|
|
49
|
+
const queryWords = query.split("-");
|
|
50
50
|
const documentIndex = buildNgramIndex(document);
|
|
51
51
|
let skippedNgrams = 0;
|
|
52
52
|
let totalNgrams = 0;
|
|
@@ -61,7 +61,7 @@ function collectBoundaryVotes(query, document) {
|
|
|
61
61
|
}
|
|
62
62
|
for (const documentStart of documentStarts) {
|
|
63
63
|
startVotes.push(documentStart - start);
|
|
64
|
-
endVotes.push(documentStart + (
|
|
64
|
+
endVotes.push(documentStart + (queryWords.length - start));
|
|
65
65
|
}
|
|
66
66
|
}
|
|
67
67
|
if (skippedNgrams > totalNgrams / 2) {
|
|
@@ -97,6 +97,14 @@ function chooseBestFromBins(bins, dir) {
|
|
|
97
97
|
}
|
|
98
98
|
return dir > 0 ? (0, import_itertools.max)(best) ?? null : (0, import_itertools.min)(best) ?? null;
|
|
99
99
|
}
|
|
100
|
+
/**
 * Converts a word index in a hyphen-separated document into a character
 * offset: the position at which the word with that index starts.
 *
 * Indices below 0 map to offset 0. Indices at or beyond the word count map
 * to `document.length`; the result is clamped because the last word has no
 * trailing separator, so counting one would point past the end.
 *
 * @param {number} wordIndex Index of the word whose start offset is wanted.
 * @param {string} document Words joined with "-".
 * @returns {number} Character offset within `document`, in [0, length].
 */
function getOffsetFromWordIndex(wordIndex, document) {
  const words = document.split("-");
  const wordCount = Math.min(words.length, Math.max(0, wordIndex));
  let offset = 0;
  for (let i = 0; i < wordCount; i++) {
    // +1 accounts for the "-" that follows each word.
    offset += words[i].length + 1;
  }
  return Math.min(offset, document.length);
}
|
|
100
108
|
function findBoundaries(query, document) {
|
|
101
109
|
const boundaryVotes = collectBoundaryVotes(query, document);
|
|
102
110
|
if (!boundaryVotes) return null;
|
|
@@ -111,7 +119,10 @@ function findBoundaries(query, document) {
|
|
|
111
119
|
if (bestEnd === null) {
|
|
112
120
|
return null;
|
|
113
121
|
}
|
|
114
|
-
return {
|
|
122
|
+
return {
|
|
123
|
+
start: getOffsetFromWordIndex(bestStart, document),
|
|
124
|
+
end: getOffsetFromWordIndex(bestEnd, document)
|
|
125
|
+
};
|
|
115
126
|
}
|
|
116
127
|
// Annotate the CommonJS export names for ESM import in node:
|
|
117
128
|
0 && (module.exports = {
|