@storyteller-platform/align 0.1.36 → 0.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,14 +105,25 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
105
105
  var _stack2 = [];
106
106
  try {
107
107
  const outFormat = options.outFormat ?? "epub";
108
+ const epubPath = outFormat === "epub" ? (0, import_node_path.join)(
109
+ (0, import_node_os.tmpdir)(),
110
+ `storyteller-platform-align-${(0, import_node_crypto.randomUUID)()}`,
111
+ (0, import_posix.basename)(output)
112
+ ) : input;
113
+ const stack = __using(_stack2, new DisposableStack());
114
+ stack.defer(() => {
115
+ if (outFormat === "epub") {
116
+ (0, import_node_fs.rmSync)((0, import_posix.dirname)(epubPath), { recursive: true, force: true });
117
+ }
118
+ });
108
119
  if (outFormat === "epub") {
109
- await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
110
- await (0, import_promises.copyFile)(input, output);
120
+ await (0, import_promises.mkdir)((0, import_posix.dirname)(epubPath), { recursive: true });
121
+ await (0, import_promises.copyFile)(input, epubPath);
111
122
  }
112
123
  const audiobookFiles = await (0, import_promises.readdir)(audiobookDir).then(
113
124
  (filenames) => filenames.filter((f) => (0, import_audiobook.isAudioFile)(f)).map((f) => (0, import_node_path.join)(audiobookDir, f))
114
125
  );
115
- const epub = __using(_stack2, await import_epub.Epub.from(outFormat === "epub" ? output : input));
126
+ const epub = __using(_stack2, await import_epub.Epub.from(epubPath));
116
127
  const transcriptions = await (0, import_promises.readdir)(transcriptionsDir).then(
117
128
  (filenames) => filenames.filter((f) => f.endsWith(".json")).map((f) => (0, import_node_path.join)(transcriptionsDir, f))
118
129
  ).then(
@@ -146,6 +157,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
146
157
  const timing = await aligner.alignBook(options.onProgress);
147
158
  if (outFormat === "epub") {
148
159
  await epub.saveAndClose();
160
+ await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
161
+ await (0, import_promises.copyFile)(epubPath, output);
149
162
  } else {
150
163
  var _stack = [];
151
164
  try {
@@ -166,8 +179,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
166
179
  writeStream.on("close", () => {
167
180
  resolve();
168
181
  });
169
- const stack = __using(_stack, new AsyncDisposableStack(), true);
170
- stack.defer(async () => {
182
+ const stack2 = __using(_stack, new AsyncDisposableStack(), true);
183
+ stack2.defer(async () => {
171
184
  writeStream.close();
172
185
  await (0, import_promises.rm)(tmpArchivePath, { force: true });
173
186
  });
@@ -229,8 +242,12 @@ class Aligner {
229
242
  timing = (0, import_ghost_story.createAggregator)();
230
243
  granularity;
231
244
  textRef;
245
+ audioFileDurations = {};
232
246
  report = {
233
- chapters: []
247
+ chapters: [],
248
+ unalignedChapters: [],
249
+ audioFiles: [],
250
+ unalignedAudioFiles: []
234
251
  };
235
252
  async getChapterSentences(chapterId) {
236
253
  const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
@@ -301,18 +318,24 @@ class Aligner {
301
318
  );
302
319
  }
303
320
  if (this.granularity === "word") {
304
- const wordFactory = new import_textFragments.TextFragmentFactory(
305
- blockRanges.flatMap((range) => {
306
- const sentence = sentences[range.id];
307
- const wordRanges2 = wordRangeMap.get(range.id);
308
- const toFragment = wordIdToFragment.get(range.id);
309
- if (!wordRanges2 || !toFragment) return [];
310
- const words = sentence.words.entries.filter(
311
- (w) => w.text.match(/\S/)
312
- );
313
- return words.map((w) => w.text.replace("\n", ""));
314
- })
315
- );
321
+ const allWords = [];
322
+ for (const range of blockRanges) {
323
+ const sentence = sentences[range.id];
324
+ const words = [];
325
+ for (const w of sentence.words.entries) {
326
+ if (w.isPunctuation) {
327
+ const lastWord = words.at(-1);
328
+ if (lastWord === void 0) {
329
+ continue;
330
+ }
331
+ words[words.length - 1] = lastWord + w.text.replace("\n", "");
332
+ } else {
333
+ words.push(w.text);
334
+ }
335
+ }
336
+ allWords.push(...words);
337
+ }
338
+ const wordFactory = new import_textFragments.TextFragmentFactory(allWords);
316
339
  let wordRangeIndex = 0;
317
340
  for (const range of blockRanges) {
318
341
  const wordRanges2 = wordRangeMap.get(range.id);
@@ -388,19 +411,53 @@ class Aligner {
388
411
  value: import_epub.Epub.formatSmilDuration(chapterDuration)
389
412
  });
390
413
  }
391
- addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
414
+ addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, mapping, transcriptionOffset, endTranscriptionOffset) {
415
+ const audioFiles = sentenceRanges.reduce(
416
+ (acc, range) => {
417
+ const existing = acc.find(
418
+ (context) => context.filepath === range.audiofile
419
+ );
420
+ if (existing) {
421
+ existing.end = range.end;
422
+ return acc;
423
+ }
424
+ acc.push({
425
+ filepath: range.audiofile,
426
+ start: range.start,
427
+ end: range.end
428
+ });
429
+ return acc;
430
+ },
431
+ []
432
+ );
433
+ const mappedTranscriptionOffset = mapping.invert().map(transcriptionOffset);
434
+ const mappedEndTranscriptionOffset = mapping.invert().map(endTranscriptionOffset);
392
435
  this.report.chapters.push({
393
436
  href: chapter.href,
394
- transcriptionOffset,
437
+ transcriptionOffset: mappedTranscriptionOffset,
438
+ endTranscriptionOffset: mappedEndTranscriptionOffset,
395
439
  transcriptionContext: {
396
440
  before: this.transcription.transcript.slice(
397
- Math.max(0, transcriptionOffset - 30),
398
- transcriptionOffset
441
+ Math.max(0, mappedTranscriptionOffset - 80),
442
+ mappedTranscriptionOffset
399
443
  ),
400
444
  after: this.transcription.transcript.slice(
401
- transcriptionOffset,
445
+ mappedTranscriptionOffset,
402
446
  Math.min(
403
- transcriptionOffset + 30,
447
+ mappedTranscriptionOffset + 80,
448
+ this.transcription.transcript.length - 1
449
+ )
450
+ )
451
+ },
452
+ endTranscriptionContext: {
453
+ before: this.transcription.transcript.slice(
454
+ Math.max(0, mappedEndTranscriptionOffset - 80),
455
+ mappedEndTranscriptionOffset
456
+ ),
457
+ after: this.transcription.transcript.slice(
458
+ mappedEndTranscriptionOffset,
459
+ Math.min(
460
+ mappedEndTranscriptionOffset + 80,
404
461
  this.transcription.transcript.length - 1
405
462
  )
406
463
  )
@@ -421,24 +478,30 @@ class Aligner {
421
478
  },
422
479
  chapterSentenceCount: chapterSentences.length,
423
480
  alignedSentenceCount: sentenceRanges.length,
424
- audioFiles: sentenceRanges.reduce((acc, range) => {
425
- const existing = acc.find(
426
- (context) => context.filepath === range.audiofile
427
- );
428
- if (existing) {
429
- existing.end = range.end;
430
- return acc;
431
- }
432
- acc.push({
433
- filepath: range.audiofile,
434
- start: range.start,
435
- end: range.end
436
- });
437
- return acc;
438
- }, [])
481
+ audioFiles
439
482
  });
483
+ for (const audioFile of audioFiles) {
484
+ const existing = this.report.audioFiles.find(
485
+ ({ filepath }) => audioFile.filepath === filepath
486
+ );
487
+ if (existing) {
488
+ existing.matchedRanges.push({
489
+ start: audioFile.start,
490
+ end: audioFile.end
491
+ });
492
+ existing.matchedRanges.sort((a, b) => a.start - b.start);
493
+ existing.alignedDuration += audioFile.end - audioFile.start;
494
+ } else {
495
+ this.report.audioFiles.push({
496
+ alignedDuration: audioFile.end - audioFile.start,
497
+ duration: this.audioFileDurations[audioFile.filepath] ?? 0,
498
+ filepath: audioFile.filepath,
499
+ matchedRanges: [{ start: audioFile.start, end: audioFile.end }]
500
+ });
501
+ }
502
+ }
440
503
  }
441
- async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline) {
504
+ async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline, mapping) {
442
505
  const timing = (0, import_ghost_story.createTiming)();
443
506
  timing.start("read contents");
444
507
  const manifest = await this.epub.getManifest();
@@ -493,7 +556,9 @@ class Aligner {
493
556
  sentenceRanges,
494
557
  firstFoundSentence,
495
558
  lastFoundSentence,
496
- transcriptionOffset
559
+ mapping,
560
+ transcriptionOffset,
561
+ endTranscriptionOffset
497
562
  );
498
563
  return {
499
564
  lastSentenceRange: sentenceRanges.at(-1) ?? null,
@@ -525,6 +590,9 @@ class Aligner {
525
590
  const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
526
591
  this.timing.setMetadata("language", locale.toString());
527
592
  this.timing.setMetadata("granularity", this.granularity);
593
+ for (const audiofile of this.audiofiles) {
594
+ this.audioFileDurations[audiofile] = await (0, import_ffmpeg.getTrackDuration)(audiofile);
595
+ }
528
596
  const spine = await this.epub.getSpineItems();
529
597
  const manifest = await this.epub.getManifest();
530
598
  const { result: transcriptionText, mapping } = await (0, import_slugify.slugify)(
@@ -540,6 +608,10 @@ class Aligner {
540
608
  );
541
609
  const chapterId = spineItem.id;
542
610
  if (manifest[chapterId]?.properties?.includes("nav")) {
611
+ this.report.unalignedChapters.push({
612
+ href: spineItem.href,
613
+ reason: "is-nav"
614
+ });
543
615
  continue;
544
616
  }
545
617
  const chapterSentences = await this.getChapterSentences(chapterId);
@@ -551,6 +623,10 @@ class Aligner {
551
623
  }
552
624
  if (chapterSentences.length === 0) {
553
625
  this.logger?.info(`Chapter #${index} has no text; skipping`);
626
+ this.report.unalignedChapters.push({
627
+ href: spineItem.href,
628
+ reason: "no-text"
629
+ });
554
630
  continue;
555
631
  }
556
632
  if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
@@ -558,6 +634,10 @@ class Aligner {
558
634
  this.logger?.info(
559
635
  `Chapter #${index} is fewer than four words; skipping`
560
636
  );
637
+ this.report.unalignedChapters.push({
638
+ href: spineItem.href,
639
+ reason: "too-short"
640
+ });
561
641
  continue;
562
642
  }
563
643
  const boundaries = (0, import_search.findBoundaries)(
@@ -568,6 +648,12 @@ class Aligner {
568
648
  this.logger?.info(
569
649
  `Could not find chapter #${index} in the transcripton`
570
650
  );
651
+ this.report.unalignedChapters.push({
652
+ href: spineItem.href,
653
+ reason: "not-found",
654
+ start: chapterSentences.slice(0, 3).map((s) => s.text).join("").slice(0, 80),
655
+ end: chapterSentences.slice(-3).map((s) => s.text).join("").slice(-80)
656
+ });
571
657
  continue;
572
658
  }
573
659
  const { start, end } = this.narrowToAvailableBoundary(boundaries);
@@ -580,7 +666,8 @@ class Aligner {
580
666
  Math.max(start, 0),
581
667
  Math.min(end, transcriptionText.length),
582
668
  locale,
583
- mappedTimeline
669
+ mappedTimeline,
670
+ mapping
584
671
  );
585
672
  this.timing.add(result.timing.summary());
586
673
  }
@@ -602,16 +689,8 @@ class Aligner {
602
689
  });
603
690
  const sentenceRanges = [];
604
691
  const chapterSentenceCounts = {};
605
- const audioFileDurations = {};
606
692
  for (const alignedChapter of audioOrderedChapters) {
607
693
  sentenceRanges.push(...alignedChapter.sentenceRanges);
608
- for (const sentenceRange of sentenceRanges) {
609
- if (!(sentenceRange.audiofile in audioFileDurations)) {
610
- audioFileDurations[sentenceRange.audiofile] = await (0, import_ffmpeg.getTrackDuration)(
611
- sentenceRange.audiofile
612
- );
613
- }
614
- }
615
694
  const sentences = await this.getChapterSentences(
616
695
  alignedChapter.chapter.id
617
696
  );
@@ -620,7 +699,7 @@ class Aligner {
620
699
  const interpolated = (0, import_interpolateSentenceRanges.interpolateSentenceRanges)(
621
700
  sentenceRanges,
622
701
  chapterSentenceCounts,
623
- audioFileDurations
702
+ this.audioFileDurations
624
703
  );
625
704
  const expanded = (0, import_getSentenceRanges.expandEmptySentenceRanges)(interpolated);
626
705
  const collapsed = await (0, import_getSentenceRanges.collapseSentenceRangeGaps)(expanded);
@@ -640,6 +719,11 @@ class Aligner {
640
719
  await this.writeAlignedChapter(alignedChapter);
641
720
  collapsedStart += sentences.length;
642
721
  }
722
+ for (const audiofile of this.audiofiles) {
723
+ if (!this.report.audioFiles.some(({ filepath }) => filepath === audiofile)) {
724
+ this.report.unalignedAudioFiles.push({ filepath: audiofile });
725
+ }
726
+ }
643
727
  await this.epub.addMetadata({
644
728
  type: "meta",
645
729
  properties: { property: "media:duration" },
@@ -14,10 +14,15 @@ interface AudioFileContext {
14
14
  interface ChapterReport {
15
15
  href: string;
16
16
  transcriptionOffset: number;
17
+ endTranscriptionOffset: number;
17
18
  transcriptionContext: {
18
19
  before: string;
19
20
  after: string;
20
21
  };
22
+ endTranscriptionContext: {
23
+ before: string;
24
+ after: string;
25
+ };
21
26
  firstMatchedSentenceId: number;
22
27
  firstMatchedSentenceContext: {
23
28
  prevSentence: string | null;
@@ -34,8 +39,34 @@ interface ChapterReport {
34
39
  alignedSentenceCount: number;
35
40
  audioFiles: AudioFileContext[];
36
41
  }
42
+ type UnalignedChapterReason = "too-short" | "not-found" | "is-nav" | "no-text";
43
+ interface UnalignedChapterReport {
44
+ href: string;
45
+ reason: Exclude<UnalignedChapterReason, "not-found">;
46
+ }
47
+ interface UnalignedNotFoundChapterReport {
48
+ href: string;
49
+ reason: "not-found";
50
+ start: string;
51
+ end: string;
52
+ }
53
+ interface AudioFileReport {
54
+ filepath: string;
55
+ matchedRanges: {
56
+ start: number;
57
+ end: number;
58
+ }[];
59
+ duration: number;
60
+ alignedDuration: number;
61
+ }
62
+ interface UnalignedAudioFileReport {
63
+ filepath: string;
64
+ }
37
65
  interface Report {
38
66
  chapters: ChapterReport[];
67
+ unalignedChapters: (UnalignedChapterReport | UnalignedNotFoundChapterReport)[];
68
+ audioFiles: AudioFileReport[];
69
+ unalignedAudioFiles: UnalignedAudioFileReport[];
39
70
  }
40
71
  interface AlignOptions {
41
72
  reportsPath?: string | null | undefined;
@@ -58,6 +89,7 @@ declare class Aligner {
58
89
  private timing;
59
90
  private granularity;
60
91
  private textRef;
92
+ private audioFileDurations;
61
93
  report: Report;
62
94
  constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, textRef: "id-fragment" | "text-fragment" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
63
95
  private getChapterSentences;
@@ -14,10 +14,15 @@ interface AudioFileContext {
14
14
  interface ChapterReport {
15
15
  href: string;
16
16
  transcriptionOffset: number;
17
+ endTranscriptionOffset: number;
17
18
  transcriptionContext: {
18
19
  before: string;
19
20
  after: string;
20
21
  };
22
+ endTranscriptionContext: {
23
+ before: string;
24
+ after: string;
25
+ };
21
26
  firstMatchedSentenceId: number;
22
27
  firstMatchedSentenceContext: {
23
28
  prevSentence: string | null;
@@ -34,8 +39,34 @@ interface ChapterReport {
34
39
  alignedSentenceCount: number;
35
40
  audioFiles: AudioFileContext[];
36
41
  }
42
+ type UnalignedChapterReason = "too-short" | "not-found" | "is-nav" | "no-text";
43
+ interface UnalignedChapterReport {
44
+ href: string;
45
+ reason: Exclude<UnalignedChapterReason, "not-found">;
46
+ }
47
+ interface UnalignedNotFoundChapterReport {
48
+ href: string;
49
+ reason: "not-found";
50
+ start: string;
51
+ end: string;
52
+ }
53
+ interface AudioFileReport {
54
+ filepath: string;
55
+ matchedRanges: {
56
+ start: number;
57
+ end: number;
58
+ }[];
59
+ duration: number;
60
+ alignedDuration: number;
61
+ }
62
+ interface UnalignedAudioFileReport {
63
+ filepath: string;
64
+ }
37
65
  interface Report {
38
66
  chapters: ChapterReport[];
67
+ unalignedChapters: (UnalignedChapterReport | UnalignedNotFoundChapterReport)[];
68
+ audioFiles: AudioFileReport[];
69
+ unalignedAudioFiles: UnalignedAudioFileReport[];
39
70
  }
40
71
  interface AlignOptions {
41
72
  reportsPath?: string | null | undefined;
@@ -58,6 +89,7 @@ declare class Aligner {
58
89
  private timing;
59
90
  private granularity;
60
91
  private textRef;
92
+ private audioFileDurations;
61
93
  report: Report;
62
94
  constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, textRef: "id-fragment" | "text-fragment" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
63
95
  private getChapterSentences;
@@ -3,7 +3,7 @@ import {
3
3
  __using
4
4
  } from "../chunk-BIEQXUOY.js";
5
5
  import { randomUUID } from "node:crypto";
6
- import { createWriteStream } from "node:fs";
6
+ import { createWriteStream, rmSync } from "node:fs";
7
7
  import {
8
8
  copyFile,
9
9
  cp,
@@ -51,14 +51,25 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
51
51
  var _stack2 = [];
52
52
  try {
53
53
  const outFormat = options.outFormat ?? "epub";
54
+ const epubPath = outFormat === "epub" ? autoJoin(
55
+ tmpdir(),
56
+ `storyteller-platform-align-${randomUUID()}`,
57
+ basename(output)
58
+ ) : input;
59
+ const stack = __using(_stack2, new DisposableStack());
60
+ stack.defer(() => {
61
+ if (outFormat === "epub") {
62
+ rmSync(dirname(epubPath), { recursive: true, force: true });
63
+ }
64
+ });
54
65
  if (outFormat === "epub") {
55
- await mkdir(dirname(output), { recursive: true });
56
- await copyFile(input, output);
66
+ await mkdir(dirname(epubPath), { recursive: true });
67
+ await copyFile(input, epubPath);
57
68
  }
58
69
  const audiobookFiles = await readdir(audiobookDir).then(
59
70
  (filenames) => filenames.filter((f) => isAudioFile(f)).map((f) => autoJoin(audiobookDir, f))
60
71
  );
61
- const epub = __using(_stack2, await Epub.from(outFormat === "epub" ? output : input));
72
+ const epub = __using(_stack2, await Epub.from(epubPath));
62
73
  const transcriptions = await readdir(transcriptionsDir).then(
63
74
  (filenames) => filenames.filter((f) => f.endsWith(".json")).map((f) => autoJoin(transcriptionsDir, f))
64
75
  ).then(
@@ -92,6 +103,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
92
103
  const timing = await aligner.alignBook(options.onProgress);
93
104
  if (outFormat === "epub") {
94
105
  await epub.saveAndClose();
106
+ await mkdir(dirname(output), { recursive: true });
107
+ await copyFile(epubPath, output);
95
108
  } else {
96
109
  var _stack = [];
97
110
  try {
@@ -112,8 +125,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
112
125
  writeStream.on("close", () => {
113
126
  resolve();
114
127
  });
115
- const stack = __using(_stack, new AsyncDisposableStack(), true);
116
- stack.defer(async () => {
128
+ const stack2 = __using(_stack, new AsyncDisposableStack(), true);
129
+ stack2.defer(async () => {
117
130
  writeStream.close();
118
131
  await rm(tmpArchivePath, { force: true });
119
132
  });
@@ -175,8 +188,12 @@ class Aligner {
175
188
  timing = createAggregator();
176
189
  granularity;
177
190
  textRef;
191
+ audioFileDurations = {};
178
192
  report = {
179
- chapters: []
193
+ chapters: [],
194
+ unalignedChapters: [],
195
+ audioFiles: [],
196
+ unalignedAudioFiles: []
180
197
  };
181
198
  async getChapterSentences(chapterId) {
182
199
  const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
@@ -247,18 +264,24 @@ class Aligner {
247
264
  );
248
265
  }
249
266
  if (this.granularity === "word") {
250
- const wordFactory = new TextFragmentFactory(
251
- blockRanges.flatMap((range) => {
252
- const sentence = sentences[range.id];
253
- const wordRanges2 = wordRangeMap.get(range.id);
254
- const toFragment = wordIdToFragment.get(range.id);
255
- if (!wordRanges2 || !toFragment) return [];
256
- const words = sentence.words.entries.filter(
257
- (w) => w.text.match(/\S/)
258
- );
259
- return words.map((w) => w.text.replace("\n", ""));
260
- })
261
- );
267
+ const allWords = [];
268
+ for (const range of blockRanges) {
269
+ const sentence = sentences[range.id];
270
+ const words = [];
271
+ for (const w of sentence.words.entries) {
272
+ if (w.isPunctuation) {
273
+ const lastWord = words.at(-1);
274
+ if (lastWord === void 0) {
275
+ continue;
276
+ }
277
+ words[words.length - 1] = lastWord + w.text.replace("\n", "");
278
+ } else {
279
+ words.push(w.text);
280
+ }
281
+ }
282
+ allWords.push(...words);
283
+ }
284
+ const wordFactory = new TextFragmentFactory(allWords);
262
285
  let wordRangeIndex = 0;
263
286
  for (const range of blockRanges) {
264
287
  const wordRanges2 = wordRangeMap.get(range.id);
@@ -334,19 +357,53 @@ class Aligner {
334
357
  value: Epub.formatSmilDuration(chapterDuration)
335
358
  });
336
359
  }
337
- addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
360
+ addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, mapping, transcriptionOffset, endTranscriptionOffset) {
361
+ const audioFiles = sentenceRanges.reduce(
362
+ (acc, range) => {
363
+ const existing = acc.find(
364
+ (context) => context.filepath === range.audiofile
365
+ );
366
+ if (existing) {
367
+ existing.end = range.end;
368
+ return acc;
369
+ }
370
+ acc.push({
371
+ filepath: range.audiofile,
372
+ start: range.start,
373
+ end: range.end
374
+ });
375
+ return acc;
376
+ },
377
+ []
378
+ );
379
+ const mappedTranscriptionOffset = mapping.invert().map(transcriptionOffset);
380
+ const mappedEndTranscriptionOffset = mapping.invert().map(endTranscriptionOffset);
338
381
  this.report.chapters.push({
339
382
  href: chapter.href,
340
- transcriptionOffset,
383
+ transcriptionOffset: mappedTranscriptionOffset,
384
+ endTranscriptionOffset: mappedEndTranscriptionOffset,
341
385
  transcriptionContext: {
342
386
  before: this.transcription.transcript.slice(
343
- Math.max(0, transcriptionOffset - 30),
344
- transcriptionOffset
387
+ Math.max(0, mappedTranscriptionOffset - 80),
388
+ mappedTranscriptionOffset
345
389
  ),
346
390
  after: this.transcription.transcript.slice(
347
- transcriptionOffset,
391
+ mappedTranscriptionOffset,
348
392
  Math.min(
349
- transcriptionOffset + 30,
393
+ mappedTranscriptionOffset + 80,
394
+ this.transcription.transcript.length - 1
395
+ )
396
+ )
397
+ },
398
+ endTranscriptionContext: {
399
+ before: this.transcription.transcript.slice(
400
+ Math.max(0, mappedEndTranscriptionOffset - 80),
401
+ mappedEndTranscriptionOffset
402
+ ),
403
+ after: this.transcription.transcript.slice(
404
+ mappedEndTranscriptionOffset,
405
+ Math.min(
406
+ mappedEndTranscriptionOffset + 80,
350
407
  this.transcription.transcript.length - 1
351
408
  )
352
409
  )
@@ -367,24 +424,30 @@ class Aligner {
367
424
  },
368
425
  chapterSentenceCount: chapterSentences.length,
369
426
  alignedSentenceCount: sentenceRanges.length,
370
- audioFiles: sentenceRanges.reduce((acc, range) => {
371
- const existing = acc.find(
372
- (context) => context.filepath === range.audiofile
373
- );
374
- if (existing) {
375
- existing.end = range.end;
376
- return acc;
377
- }
378
- acc.push({
379
- filepath: range.audiofile,
380
- start: range.start,
381
- end: range.end
382
- });
383
- return acc;
384
- }, [])
427
+ audioFiles
385
428
  });
429
+ for (const audioFile of audioFiles) {
430
+ const existing = this.report.audioFiles.find(
431
+ ({ filepath }) => audioFile.filepath === filepath
432
+ );
433
+ if (existing) {
434
+ existing.matchedRanges.push({
435
+ start: audioFile.start,
436
+ end: audioFile.end
437
+ });
438
+ existing.matchedRanges.sort((a, b) => a.start - b.start);
439
+ existing.alignedDuration += audioFile.end - audioFile.start;
440
+ } else {
441
+ this.report.audioFiles.push({
442
+ alignedDuration: audioFile.end - audioFile.start,
443
+ duration: this.audioFileDurations[audioFile.filepath] ?? 0,
444
+ filepath: audioFile.filepath,
445
+ matchedRanges: [{ start: audioFile.start, end: audioFile.end }]
446
+ });
447
+ }
448
+ }
386
449
  }
387
- async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline) {
450
+ async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline, mapping) {
388
451
  const timing = createTiming();
389
452
  timing.start("read contents");
390
453
  const manifest = await this.epub.getManifest();
@@ -439,7 +502,9 @@ class Aligner {
439
502
  sentenceRanges,
440
503
  firstFoundSentence,
441
504
  lastFoundSentence,
442
- transcriptionOffset
505
+ mapping,
506
+ transcriptionOffset,
507
+ endTranscriptionOffset
443
508
  );
444
509
  return {
445
510
  lastSentenceRange: sentenceRanges.at(-1) ?? null,
@@ -471,6 +536,9 @@ class Aligner {
471
536
  const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
472
537
  this.timing.setMetadata("language", locale.toString());
473
538
  this.timing.setMetadata("granularity", this.granularity);
539
+ for (const audiofile of this.audiofiles) {
540
+ this.audioFileDurations[audiofile] = await getTrackDuration(audiofile);
541
+ }
474
542
  const spine = await this.epub.getSpineItems();
475
543
  const manifest = await this.epub.getManifest();
476
544
  const { result: transcriptionText, mapping } = await slugify(
@@ -486,6 +554,10 @@ class Aligner {
486
554
  );
487
555
  const chapterId = spineItem.id;
488
556
  if (manifest[chapterId]?.properties?.includes("nav")) {
557
+ this.report.unalignedChapters.push({
558
+ href: spineItem.href,
559
+ reason: "is-nav"
560
+ });
489
561
  continue;
490
562
  }
491
563
  const chapterSentences = await this.getChapterSentences(chapterId);
@@ -497,6 +569,10 @@ class Aligner {
497
569
  }
498
570
  if (chapterSentences.length === 0) {
499
571
  this.logger?.info(`Chapter #${index} has no text; skipping`);
572
+ this.report.unalignedChapters.push({
573
+ href: spineItem.href,
574
+ reason: "no-text"
575
+ });
500
576
  continue;
501
577
  }
502
578
  if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
@@ -504,6 +580,10 @@ class Aligner {
504
580
  this.logger?.info(
505
581
  `Chapter #${index} is fewer than four words; skipping`
506
582
  );
583
+ this.report.unalignedChapters.push({
584
+ href: spineItem.href,
585
+ reason: "too-short"
586
+ });
507
587
  continue;
508
588
  }
509
589
  const boundaries = findBoundaries(
@@ -514,6 +594,12 @@ class Aligner {
514
594
  this.logger?.info(
515
595
  `Could not find chapter #${index} in the transcripton`
516
596
  );
597
+ this.report.unalignedChapters.push({
598
+ href: spineItem.href,
599
+ reason: "not-found",
600
+ start: chapterSentences.slice(0, 3).map((s) => s.text).join("").slice(0, 80),
601
+ end: chapterSentences.slice(-3).map((s) => s.text).join("").slice(-80)
602
+ });
517
603
  continue;
518
604
  }
519
605
  const { start, end } = this.narrowToAvailableBoundary(boundaries);
@@ -526,7 +612,8 @@ class Aligner {
526
612
  Math.max(start, 0),
527
613
  Math.min(end, transcriptionText.length),
528
614
  locale,
529
- mappedTimeline
615
+ mappedTimeline,
616
+ mapping
530
617
  );
531
618
  this.timing.add(result.timing.summary());
532
619
  }
@@ -548,16 +635,8 @@ class Aligner {
548
635
  });
549
636
  const sentenceRanges = [];
550
637
  const chapterSentenceCounts = {};
551
- const audioFileDurations = {};
552
638
  for (const alignedChapter of audioOrderedChapters) {
553
639
  sentenceRanges.push(...alignedChapter.sentenceRanges);
554
- for (const sentenceRange of sentenceRanges) {
555
- if (!(sentenceRange.audiofile in audioFileDurations)) {
556
- audioFileDurations[sentenceRange.audiofile] = await getTrackDuration(
557
- sentenceRange.audiofile
558
- );
559
- }
560
- }
561
640
  const sentences = await this.getChapterSentences(
562
641
  alignedChapter.chapter.id
563
642
  );
@@ -566,7 +645,7 @@ class Aligner {
566
645
  const interpolated = interpolateSentenceRanges(
567
646
  sentenceRanges,
568
647
  chapterSentenceCounts,
569
- audioFileDurations
648
+ this.audioFileDurations
570
649
  );
571
650
  const expanded = expandEmptySentenceRanges(interpolated);
572
651
  const collapsed = await collapseSentenceRangeGaps(expanded);
@@ -586,6 +665,11 @@ class Aligner {
586
665
  await this.writeAlignedChapter(alignedChapter);
587
666
  collapsedStart += sentences.length;
588
667
  }
668
+ for (const audiofile of this.audiofiles) {
669
+ if (!this.report.audioFiles.some(({ filepath }) => filepath === audiofile)) {
670
+ this.report.unalignedAudioFiles.push({ filepath: audiofile });
671
+ }
672
+ }
589
673
  await this.epub.addMetadata({
590
674
  type: "meta",
591
675
  properties: { property: "media:duration" },
@@ -40,7 +40,9 @@ const alignParser = (0, import_core.object)("Alignment", {
40
40
  }),
41
41
  "id-fragment"
42
42
  ),
43
- reports: (0, import_core.optional)((0, import_core.option)("--reports", (0, import_valueparser.path)({ type: "directory" }))),
43
+ reports: (0, import_core.optional)(
44
+ (0, import_core.option)("--reports", (0, import_valueparser.path)({ type: "file", extensions: [".json"] }))
45
+ ),
44
46
  outFormat: (0, import_core.withDefault)(
45
47
  (0, import_core.option)("--out-format", (0, import_core.choice)(["epub", "gnp"]), {
46
48
  description: import_core.message`Whether to output a full EPUB 3 package with embedded media overlays and audio, or a Readium Guided Navigation Package with just a manifest and guided navigation documents.`
@@ -31,7 +31,9 @@ const alignParser = object("Alignment", {
31
31
  }),
32
32
  "id-fragment"
33
33
  ),
34
- reports: optional(option("--reports", path({ type: "directory" }))),
34
+ reports: optional(
35
+ option("--reports", path({ type: "file", extensions: [".json"] }))
36
+ ),
35
37
  outFormat: withDefault(
36
38
  option("--out-format", choice(["epub", "gnp"]), {
37
39
  description: message`Whether to output a full EPUB 3 package with embedded media overlays and audio, or a Readium Guided Navigation Package with just a manifest and guided navigation documents.`
@@ -133,6 +133,7 @@ async function slugify(text, locale) {
133
133
  replacerMap.set(locale, replacers);
134
134
  const { result, mapping } = await (0, import_transliteration.slugify)(text, {
135
135
  allowedChars: "a-zA-Z0-9",
136
+ locale,
136
137
  replace: replacers
137
138
  });
138
139
  return { result, mapping };
@@ -111,6 +111,7 @@ async function slugify(text, locale) {
111
111
  replacerMap.set(locale, replacers);
112
112
  const { result, mapping } = await transliterateSlugify(text, {
113
113
  allowedChars: "a-zA-Z0-9",
114
+ locale,
114
115
  replace: replacers
115
116
  });
116
117
  return { result, mapping };
@@ -64,15 +64,23 @@ class TextFragmentFactory {
64
64
  toRemove.toReversed().map((r) => candidates.splice(r, 1));
65
65
  i++;
66
66
  }
67
+ while (chars.at(i)?.match(/[\p{L}\p{N}]/u) && i < chars.length) i++;
67
68
  let fragment = "";
68
69
  const start = chars.slice(0, i).join("");
69
70
  fragment += encodeTextFragmentPart(start);
70
- const remainingSpan = span.slice(i);
71
+ const remainingChars = chars.slice(i);
72
+ while (!remainingChars.at(-1)?.match(/[\p{L}\p{N}]/u) && remainingChars.length) {
73
+ remainingChars.splice(remainingChars.length - 1, 1);
74
+ }
75
+ let e = remainingChars.length;
71
76
  let end = "";
72
- let e = remainingSpan.length - 1;
73
- if (remainingSpan.at(-1) === "\n") e--;
74
- while (remainingSpan.indexOf(end) !== e + 1 && e >= 0) {
75
- end = remainingSpan.slice(e);
77
+ const remainingSpan = remainingChars.join("");
78
+ while (remainingSpan.indexOf(end) !== remainingSpan.length - remainingChars.slice(e).join("").length && e >= 0) {
79
+ e--;
80
+ end = remainingChars.slice(e).join("");
81
+ }
82
+ while (remainingChars.at(e)?.match(/[\p{L}\p{N}]/u) && e >= 0) {
83
+ end = remainingChars.slice(e).join("");
76
84
  e--;
77
85
  }
78
86
  if (end) {
@@ -93,7 +101,10 @@ class TextFragmentFactory {
93
101
  p++;
94
102
  if (!candidates.length) break;
95
103
  }
96
- const prefix = this.runes.slice(startPos - p + 1, startPos).join("");
104
+ while (this.runes.at(startPos - p - 1)?.match(/[\p{L}\p{N}]/u) && p <= startPos) {
105
+ p++;
106
+ }
107
+ const prefix = this.runes.slice(startPos - p, startPos).join("");
97
108
  fragment = `${encodeTextFragmentPart(prefix)}-,${fragment}`;
98
109
  }
99
110
  return `:~:text=${fragment}`;
@@ -42,15 +42,23 @@ class TextFragmentFactory {
42
42
  toRemove.toReversed().map((r) => candidates.splice(r, 1));
43
43
  i++;
44
44
  }
45
+ while (chars.at(i)?.match(/[\p{L}\p{N}]/u) && i < chars.length) i++;
45
46
  let fragment = "";
46
47
  const start = chars.slice(0, i).join("");
47
48
  fragment += encodeTextFragmentPart(start);
48
- const remainingSpan = span.slice(i);
49
+ const remainingChars = chars.slice(i);
50
+ while (!remainingChars.at(-1)?.match(/[\p{L}\p{N}]/u) && remainingChars.length) {
51
+ remainingChars.splice(remainingChars.length - 1, 1);
52
+ }
53
+ let e = remainingChars.length;
49
54
  let end = "";
50
- let e = remainingSpan.length - 1;
51
- if (remainingSpan.at(-1) === "\n") e--;
52
- while (remainingSpan.indexOf(end) !== e + 1 && e >= 0) {
53
- end = remainingSpan.slice(e);
55
+ const remainingSpan = remainingChars.join("");
56
+ while (remainingSpan.indexOf(end) !== remainingSpan.length - remainingChars.slice(e).join("").length && e >= 0) {
57
+ e--;
58
+ end = remainingChars.slice(e).join("");
59
+ }
60
+ while (remainingChars.at(e)?.match(/[\p{L}\p{N}]/u) && e >= 0) {
61
+ end = remainingChars.slice(e).join("");
54
62
  e--;
55
63
  }
56
64
  if (end) {
@@ -71,7 +79,10 @@ class TextFragmentFactory {
71
79
  p++;
72
80
  if (!candidates.length) break;
73
81
  }
74
- const prefix = this.runes.slice(startPos - p + 1, startPos).join("");
82
+ while (this.runes.at(startPos - p - 1)?.match(/[\p{L}\p{N}]/u) && p <= startPos) {
83
+ p++;
84
+ }
85
+ const prefix = this.runes.slice(startPos - p, startPos).join("");
75
86
  fragment = `${encodeTextFragmentPart(prefix)}-,${fragment}`;
76
87
  }
77
88
  return `:~:text=${fragment}`;
package/dist/cli/bin.cjs CHANGED
@@ -229,6 +229,7 @@ async function main() {
229
229
  textRef: parsed.textRef,
230
230
  outFormat: parsed.outFormat,
231
231
  primaryLocale: parsed.language,
232
+ reportsPath: parsed.reports,
232
233
  logger,
233
234
  ...!parsed.noProgress && parsed.logLevel === "silent" && {
234
235
  onProgress: (progress) => {
@@ -349,6 +350,7 @@ async function main() {
349
350
  textRef: parsed.textRef,
350
351
  outFormat: parsed.outFormat,
351
352
  primaryLocale,
353
+ reportsPath: parsed.reports,
352
354
  logger,
353
355
  ...!parsed.noProgress && parsed.logLevel === "silent" && {
354
356
  onProgress: (progress) => {
package/dist/cli/bin.js CHANGED
@@ -180,6 +180,7 @@ async function main() {
180
180
  textRef: parsed.textRef,
181
181
  outFormat: parsed.outFormat,
182
182
  primaryLocale: parsed.language,
183
+ reportsPath: parsed.reports,
183
184
  logger,
184
185
  ...!parsed.noProgress && parsed.logLevel === "silent" && {
185
186
  onProgress: (progress) => {
@@ -300,6 +301,7 @@ async function main() {
300
301
  textRef: parsed.textRef,
301
302
  outFormat: parsed.outFormat,
302
303
  primaryLocale,
304
+ reportsPath: parsed.reports,
303
305
  logger,
304
306
  ...!parsed.noProgress && parsed.logLevel === "silent" && {
305
307
  onProgress: (progress) => {
@@ -1,2 +1,2 @@
1
1
  import '@storyteller-platform/epub';
2
- export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-TZi1QUQh.cjs';
2
+ export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-Bv3yPEdd.cjs';
@@ -1,2 +1,2 @@
1
1
  import '@storyteller-platform/epub';
2
- export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-TZi1QUQh.js';
2
+ export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-Bv3yPEdd.js';
@@ -1,5 +1,5 @@
1
1
  import { ParsedXml } from '@storyteller-platform/epub';
2
- import { R as Root, N as Node } from '../model-TZi1QUQh.cjs';
2
+ import { R as Root, N as Node } from '../model-Bv3yPEdd.cjs';
3
3
 
4
4
  declare function parseDom(xml: ParsedXml): Root;
5
5
  declare function findFootnotePairs(root: Root | Node): Map<number, number>;
@@ -1,5 +1,5 @@
1
1
  import { ParsedXml } from '@storyteller-platform/epub';
2
- import { R as Root, N as Node } from '../model-TZi1QUQh.js';
2
+ import { R as Root, N as Node } from '../model-Bv3yPEdd.js';
3
3
 
4
4
  declare function parseDom(xml: ParsedXml): Root;
5
5
  declare function findFootnotePairs(root: Root | Node): Map<number, number>;
@@ -1,2 +1,2 @@
1
- export { a as ResolvedPos } from '../model-TZi1QUQh.cjs';
1
+ export { a as ResolvedPos } from '../model-Bv3yPEdd.cjs';
2
2
  import '@storyteller-platform/epub';
@@ -1,2 +1,2 @@
1
- export { a as ResolvedPos } from '../model-TZi1QUQh.js';
1
+ export { a as ResolvedPos } from '../model-Bv3yPEdd.js';
2
2
  import '@storyteller-platform/epub';
@@ -1,5 +1,5 @@
1
1
  import { ParsedXml } from '@storyteller-platform/epub';
2
- import { R as Root } from '../model-TZi1QUQh.cjs';
2
+ import { R as Root } from '../model-Bv3yPEdd.cjs';
3
3
 
4
4
  declare function serializeDom(doc: Root): ParsedXml;
5
5
 
@@ -1,5 +1,5 @@
1
1
  import { ParsedXml } from '@storyteller-platform/epub';
2
- import { R as Root } from '../model-TZi1QUQh.js';
2
+ import { R as Root } from '../model-Bv3yPEdd.js';
3
3
 
4
4
  declare function serializeDom(doc: Root): ParsedXml;
5
5
 
@@ -81,11 +81,22 @@ function liftText(root) {
81
81
  ])
82
82
  );
83
83
  }
84
- lastTextEnd = pos + node.nodeSize;
85
84
  let result = node.text.replaceAll(/\n/g, " ");
85
+ if (text.endsWith("\n")) {
86
+ const contentStart = result.match(/\S/u)?.index ?? result.length;
87
+ if (contentStart !== 0) {
88
+ result = result.slice(contentStart);
89
+ mapping.appendMap(
90
+ new import_map.StepMap([mapping.map(lastTextEnd), contentStart, 0])
91
+ );
92
+ }
93
+ }
94
+ lastTextEnd = pos + node.nodeSize;
86
95
  const hasBlockSiblings = parent.children.some((child) => child.isBlock);
87
96
  if (hasBlockSiblings && !result.match(/\S/)) {
88
- mapping.appendMap(new import_map.StepMap([textLength, result.length, 0]));
97
+ if (result.length) {
98
+ mapping.appendMap(new import_map.StepMap([textLength, result.length, 0]));
99
+ }
89
100
  result = "";
90
101
  }
91
102
  if (parent.isBlock && index === parent.children.length - 1 && !(text + result).endsWith("\n")) {
@@ -1,5 +1,5 @@
1
1
  import { Mapping } from './map.cjs';
2
- import { R as Root, M as Mark } from '../model-TZi1QUQh.cjs';
2
+ import { R as Root, M as Mark } from '../model-Bv3yPEdd.cjs';
3
3
  import '@storyteller-platform/epub';
4
4
 
5
5
  declare function addMark(root: Root, from: number, to: number, mark: Mark): Root;
@@ -1,5 +1,5 @@
1
1
  import { Mapping } from './map.js';
2
- import { R as Root, M as Mark } from '../model-TZi1QUQh.js';
2
+ import { R as Root, M as Mark } from '../model-Bv3yPEdd.js';
3
3
  import '@storyteller-platform/epub';
4
4
 
5
5
  declare function addMark(root: Root, from: number, to: number, mark: Mark): Root;
@@ -61,11 +61,22 @@ function liftText(root) {
61
61
  ])
62
62
  );
63
63
  }
64
- lastTextEnd = pos + node.nodeSize;
65
64
  let result = node.text.replaceAll(/\n/g, " ");
65
+ if (text.endsWith("\n")) {
66
+ const contentStart = result.match(/\S/u)?.index ?? result.length;
67
+ if (contentStart !== 0) {
68
+ result = result.slice(contentStart);
69
+ mapping.appendMap(
70
+ new StepMap([mapping.map(lastTextEnd), contentStart, 0])
71
+ );
72
+ }
73
+ }
74
+ lastTextEnd = pos + node.nodeSize;
66
75
  const hasBlockSiblings = parent.children.some((child) => child.isBlock);
67
76
  if (hasBlockSiblings && !result.match(/\S/)) {
68
- mapping.appendMap(new StepMap([textLength, result.length, 0]));
77
+ if (result.length) {
78
+ mapping.appendMap(new StepMap([textLength, result.length, 0]));
79
+ }
69
80
  result = "";
70
81
  }
71
82
  if (parent.isBlock && index === parent.children.length - 1 && !(text + result).endsWith("\n")) {
@@ -45,7 +45,7 @@ declare class Node {
45
45
  get isLeaf(): boolean;
46
46
  get isInline(): boolean;
47
47
  get isBlock(): boolean;
48
- get border(): 1 | 0;
48
+ get border(): 0 | 1;
49
49
  get nodeSize(): number;
50
50
  get contentSize(): number;
51
51
  get textContent(): string;
@@ -45,7 +45,7 @@ declare class Node {
45
45
  get isLeaf(): boolean;
46
46
  get isInline(): boolean;
47
47
  get isBlock(): boolean;
48
- get border(): 1 | 0;
48
+ get border(): 0 | 1;
49
49
  get nodeSize(): number;
50
50
  get contentSize(): number;
51
51
  get textContent(): string;
@@ -71,6 +71,7 @@ var import_promises = require("node:fs/promises");
71
71
  var import_node_path = require("node:path");
72
72
  var import_posix = require("node:path/posix");
73
73
  var import_epub = require("@storyteller-platform/epub");
74
+ var import_ghost_story = require("@storyteller-platform/ghost-story");
74
75
  var import_parseDom = require("../markup/parseDom.cjs");
75
76
  var import_segmentation = require("../markup/segmentation.cjs");
76
77
  var import_transform = require("../markup/transform.cjs");
@@ -184,7 +185,9 @@ async function createAlignmentSnapshot(epub, transcriptionFilepaths, textRef) {
184
185
  }
185
186
  word = transcription.timeline[++i];
186
187
  }
187
- const transcriptionSentence = transcriptionWords.join(" ");
188
+ const transcriptionSentence = transcriptionWords.map(
189
+ (w, idx) => (0, import_ghost_story.startsWithSpacelessScript)(w) || idx === transcriptionWords.length - 1 ? w : `${w} `
190
+ ).join("");
188
191
  newSnapshot += `Audio: ${transcriptionSentence}
189
192
  `;
190
193
  }
@@ -11,6 +11,9 @@ import {
11
11
  import {
12
12
  Epub
13
13
  } from "@storyteller-platform/epub";
14
+ import {
15
+ startsWithSpacelessScript
16
+ } from "@storyteller-platform/ghost-story";
14
17
  import { parseDom } from "../markup/parseDom.js";
15
18
  import { segmentChapter } from "../markup/segmentation.js";
16
19
  import { inlineFootnotes, liftText } from "../markup/transform.js";
@@ -124,7 +127,9 @@ async function createAlignmentSnapshot(epub, transcriptionFilepaths, textRef) {
124
127
  }
125
128
  word = transcription.timeline[++i];
126
129
  }
127
- const transcriptionSentence = transcriptionWords.join(" ");
130
+ const transcriptionSentence = transcriptionWords.map(
131
+ (w, idx) => startsWithSpacelessScript(w) || idx === transcriptionWords.length - 1 ? w : `${w} `
132
+ ).join("");
128
133
  newSnapshot += `Audio: ${transcriptionSentence}
129
134
  `;
130
135
  }
@@ -84,10 +84,6 @@ var import_async_semaphore = require("@esfx/async-semaphore");
84
84
  var import_audiobook = require("@storyteller-platform/audiobook");
85
85
  var import_ghost_story = require("@storyteller-platform/ghost-story");
86
86
  async function transcribe(input, output, locale, options) {
87
- if (process.env["DEBUG_TRANSCRIBE"] === "true") {
88
- const inspector = await import("node:inspector");
89
- inspector.open(9231, "0.0.0.0", true);
90
- }
91
87
  const semaphore = new import_async_semaphore.AsyncSemaphore(options.parallelism ?? 1);
92
88
  const controller = new AbortController();
93
89
  const signal = AbortSignal.any([
@@ -15,10 +15,6 @@ import {
15
15
  recognize
16
16
  } from "@storyteller-platform/ghost-story";
17
17
  async function transcribe(input, output, locale, options) {
18
- if (process.env["DEBUG_TRANSCRIBE"] === "true") {
19
- const inspector = await import("node:inspector");
20
- inspector.open(9231, "0.0.0.0", true);
21
- }
22
18
  const semaphore = new AsyncSemaphore(options.parallelism ?? 1);
23
19
  const controller = new AbortController();
24
20
  const signal = AbortSignal.any([
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@storyteller-platform/align",
3
- "version": "0.1.36",
3
+ "version": "0.1.37",
4
4
  "description": "A library and CLI for automatically aligning audiobooks and EPUBs to produce Media Overlays",
5
5
  "author": "Shane Friedman",
6
6
  "license": "MIT",
@@ -62,7 +62,7 @@
62
62
  "@readium/shared": "patch:@readium/shared@npm%3A2.1.5#~/.yarn/patches/@readium-shared-npm-2.1.5-8d6f9d2432.patch",
63
63
  "@storyteller-platform/audiobook": "^0.3.10",
64
64
  "@storyteller-platform/epub": "^0.5.0",
65
- "@storyteller-platform/ghost-story": "^0.1.10",
65
+ "@storyteller-platform/ghost-story": "^0.1.11",
66
66
  "@storyteller-platform/transliteration": "^3.1.2",
67
67
  "chalk": "^5.4.1",
68
68
  "change-case": "^5.4.4",