npm - @storyteller-platform/align - Versions diffs - 0.1.36 → 0.1.37 - Mend

@storyteller-platform/align 0.1.36 → 0.1.37

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (31) hide show

package/dist/align/align.cjs +135 -51
package/dist/align/align.d.cts +32 -0
package/dist/align/align.d.ts +32 -0
package/dist/align/align.js +136 -52
package/dist/align/parse.cjs +3 -1
package/dist/align/parse.js +3 -1
package/dist/align/slugify.cjs +1 -0
package/dist/align/slugify.js +1 -0
package/dist/align/textFragments.cjs +17 -6
package/dist/align/textFragments.js +17 -6
package/dist/cli/bin.cjs +2 -0
package/dist/cli/bin.js +2 -0
package/dist/markup/model.d.cts +1 -1
package/dist/markup/model.d.ts +1 -1
package/dist/markup/parseDom.d.cts +1 -1
package/dist/markup/parseDom.d.ts +1 -1
package/dist/markup/resolvedPos.d.cts +1 -1
package/dist/markup/resolvedPos.d.ts +1 -1
package/dist/markup/serializeDom.d.cts +1 -1
package/dist/markup/serializeDom.d.ts +1 -1
package/dist/markup/transform.cjs +13 -2
package/dist/markup/transform.d.cts +1 -1
package/dist/markup/transform.d.ts +1 -1
package/dist/markup/transform.js +13 -2
package/dist/{model-TZi1QUQh.d.cts → model-Bv3yPEdd.d.cts} +1 -1
package/dist/{model-TZi1QUQh.d.ts → model-Bv3yPEdd.d.ts} +1 -1
package/dist/snapshot/snapshot.cjs +4 -1
package/dist/snapshot/snapshot.js +6 -1
package/dist/transcribe/transcribe.cjs +0 -4
package/dist/transcribe/transcribe.js +0 -4
package/package.json +2 -2

package/dist/align/align.cjs CHANGED Viewed

@@ -105,14 +105,25 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
   var _stack2 = [];
   try {
     const outFormat = options.outFormat ?? "epub";
+    const epubPath = outFormat === "epub" ? (0, import_node_path.join)(
+      (0, import_node_os.tmpdir)(),
+      `storyteller-platform-align-${(0, import_node_crypto.randomUUID)()}`,
+      (0, import_posix.basename)(output)
+    ) : input;
+    const stack = __using(_stack2, new DisposableStack());
+    stack.defer(() => {
+      if (outFormat === "epub") {
+        (0, import_node_fs.rmSync)((0, import_posix.dirname)(epubPath), { recursive: true, force: true });
+      }
+    });
     if (outFormat === "epub") {
-      await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
-      await (0, import_promises.copyFile)(input, output);
+      await (0, import_promises.mkdir)((0, import_posix.dirname)(epubPath), { recursive: true });
+      await (0, import_promises.copyFile)(input, epubPath);
     }
     const audiobookFiles = await (0, import_promises.readdir)(audiobookDir).then(
       (filenames) => filenames.filter((f) => (0, import_audiobook.isAudioFile)(f)).map((f) => (0, import_node_path.join)(audiobookDir, f))
     );
-    const epub = __using(_stack2, await import_epub.Epub.from(outFormat === "epub" ? output : input));
+    const epub = __using(_stack2, await import_epub.Epub.from(epubPath));
     const transcriptions = await (0, import_promises.readdir)(transcriptionsDir).then(
       (filenames) => filenames.filter((f) => f.endsWith(".json")).map((f) => (0, import_node_path.join)(transcriptionsDir, f))
     ).then(
@@ -146,6 +157,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
     const timing = await aligner.alignBook(options.onProgress);
     if (outFormat === "epub") {
       await epub.saveAndClose();
+      await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
+      await (0, import_promises.copyFile)(epubPath, output);
     } else {
       var _stack = [];
       try {
@@ -166,8 +179,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
         writeStream.on("close", () => {
           resolve();
         });
-        const stack = __using(_stack, new AsyncDisposableStack(), true);
-        stack.defer(async () => {
+        const stack2 = __using(_stack, new AsyncDisposableStack(), true);
+        stack2.defer(async () => {
           writeStream.close();
           await (0, import_promises.rm)(tmpArchivePath, { force: true });
         });
@@ -229,8 +242,12 @@ class Aligner {
   timing = (0, import_ghost_story.createAggregator)();
   granularity;
   textRef;
+  audioFileDurations = {};
   report = {
-    chapters: []
+    chapters: [],
+    unalignedChapters: [],
+    audioFiles: [],
+    unalignedAudioFiles: []
   };
   async getChapterSentences(chapterId) {
     const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
@@ -301,18 +318,24 @@ class Aligner {
           );
         }
         if (this.granularity === "word") {
-          const wordFactory = new import_textFragments.TextFragmentFactory(
-            blockRanges.flatMap((range) => {
-              const sentence = sentences[range.id];
-              const wordRanges2 = wordRangeMap.get(range.id);
-              const toFragment = wordIdToFragment.get(range.id);
-              if (!wordRanges2 || !toFragment) return [];
-              const words = sentence.words.entries.filter(
-                (w) => w.text.match(/\S/)
-              );
-              return words.map((w) => w.text.replace("\n", ""));
-            })
-          );
+          const allWords = [];
+          for (const range of blockRanges) {
+            const sentence = sentences[range.id];
+            const words = [];
+            for (const w of sentence.words.entries) {
+              if (w.isPunctuation) {
+                const lastWord = words.at(-1);
+                if (lastWord === void 0) {
+                  continue;
+                }
+                words[words.length - 1] = lastWord + w.text.replace("\n", "");
+              } else {
+                words.push(w.text);
+              }
+            }
+            allWords.push(...words);
+          }
+          const wordFactory = new import_textFragments.TextFragmentFactory(allWords);
           let wordRangeIndex = 0;
           for (const range of blockRanges) {
             const wordRanges2 = wordRangeMap.get(range.id);
@@ -388,19 +411,53 @@ class Aligner {
       value: import_epub.Epub.formatSmilDuration(chapterDuration)
     });
   }
-  addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
+  addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, mapping, transcriptionOffset, endTranscriptionOffset) {
+    const audioFiles = sentenceRanges.reduce(
+      (acc, range) => {
+        const existing = acc.find(
+          (context) => context.filepath === range.audiofile
+        );
+        if (existing) {
+          existing.end = range.end;
+          return acc;
+        }
+        acc.push({
+          filepath: range.audiofile,
+          start: range.start,
+          end: range.end
+        });
+        return acc;
+      },
+      []
+    );
+    const mappedTranscriptionOffset = mapping.invert().map(transcriptionOffset);
+    const mappedEndTranscriptionOffset = mapping.invert().map(endTranscriptionOffset);
     this.report.chapters.push({
       href: chapter.href,
-      transcriptionOffset,
+      transcriptionOffset: mappedTranscriptionOffset,
+      endTranscriptionOffset: mappedEndTranscriptionOffset,
       transcriptionContext: {
         before: this.transcription.transcript.slice(
-          Math.max(0, transcriptionOffset - 30),
-          transcriptionOffset
+          Math.max(0, mappedTranscriptionOffset - 80),
+          mappedTranscriptionOffset
         ),
         after: this.transcription.transcript.slice(
-          transcriptionOffset,
+          mappedTranscriptionOffset,
           Math.min(
-            transcriptionOffset + 30,
+            mappedTranscriptionOffset + 80,
+            this.transcription.transcript.length - 1
+          )
+        )
+      },
+      endTranscriptionContext: {
+        before: this.transcription.transcript.slice(
+          Math.max(0, mappedEndTranscriptionOffset - 80),
+          mappedEndTranscriptionOffset
+        ),
+        after: this.transcription.transcript.slice(
+          mappedEndTranscriptionOffset,
+          Math.min(
+            mappedEndTranscriptionOffset + 80,
             this.transcription.transcript.length - 1
           )
         )
@@ -421,24 +478,30 @@ class Aligner {
       },
       chapterSentenceCount: chapterSentences.length,
       alignedSentenceCount: sentenceRanges.length,
-      audioFiles: sentenceRanges.reduce((acc, range) => {
-        const existing = acc.find(
-          (context) => context.filepath === range.audiofile
-        );
-        if (existing) {
-          existing.end = range.end;
-          return acc;
-        }
-        acc.push({
-          filepath: range.audiofile,
-          start: range.start,
-          end: range.end
-        });
-        return acc;
-      }, [])
+      audioFiles
     });
+    for (const audioFile of audioFiles) {
+      const existing = this.report.audioFiles.find(
+        ({ filepath }) => audioFile.filepath === filepath
+      );
+      if (existing) {
+        existing.matchedRanges.push({
+          start: audioFile.start,
+          end: audioFile.end
+        });
+        existing.matchedRanges.sort((a, b) => a.start - b.start);
+        existing.alignedDuration += audioFile.end - audioFile.start;
+      } else {
+        this.report.audioFiles.push({
+          alignedDuration: audioFile.end - audioFile.start,
+          duration: this.audioFileDurations[audioFile.filepath] ?? 0,
+          filepath: audioFile.filepath,
+          matchedRanges: [{ start: audioFile.start, end: audioFile.end }]
+        });
+      }
+    }
   }
-  async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline) {
+  async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline, mapping) {
     const timing = (0, import_ghost_story.createTiming)();
     timing.start("read contents");
     const manifest = await this.epub.getManifest();
@@ -493,7 +556,9 @@ class Aligner {
       sentenceRanges,
       firstFoundSentence,
       lastFoundSentence,
-      transcriptionOffset
+      mapping,
+      transcriptionOffset,
+      endTranscriptionOffset
     );
     return {
       lastSentenceRange: sentenceRanges.at(-1) ?? null,
@@ -525,6 +590,9 @@ class Aligner {
     const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
     this.timing.setMetadata("language", locale.toString());
     this.timing.setMetadata("granularity", this.granularity);
+    for (const audiofile of this.audiofiles) {
+      this.audioFileDurations[audiofile] = await (0, import_ffmpeg.getTrackDuration)(audiofile);
+    }
     const spine = await this.epub.getSpineItems();
     const manifest = await this.epub.getManifest();
     const { result: transcriptionText, mapping } = await (0, import_slugify.slugify)(
@@ -540,6 +608,10 @@ class Aligner {
       );
       const chapterId = spineItem.id;
       if (manifest[chapterId]?.properties?.includes("nav")) {
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "is-nav"
+        });
         continue;
       }
       const chapterSentences = await this.getChapterSentences(chapterId);
@@ -551,6 +623,10 @@ class Aligner {
       }
       if (chapterSentences.length === 0) {
         this.logger?.info(`Chapter #${index} has no text; skipping`);
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "no-text"
+        });
         continue;
       }
       if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
@@ -558,6 +634,10 @@ class Aligner {
         this.logger?.info(
           `Chapter #${index} is fewer than four words; skipping`
         );
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "too-short"
+        });
         continue;
       }
       const boundaries = (0, import_search.findBoundaries)(
@@ -568,6 +648,12 @@ class Aligner {
         this.logger?.info(
           `Could not find chapter #${index} in the transcripton`
         );
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "not-found",
+          start: chapterSentences.slice(0, 3).map((s) => s.text).join("").slice(0, 80),
+          end: chapterSentences.slice(-3).map((s) => s.text).join("").slice(-80)
+        });
         continue;
       }
       const { start, end } = this.narrowToAvailableBoundary(boundaries);
@@ -580,7 +666,8 @@ class Aligner {
         Math.max(start, 0),
         Math.min(end, transcriptionText.length),
         locale,
-        mappedTimeline
+        mappedTimeline,
+        mapping
       );
       this.timing.add(result.timing.summary());
     }
@@ -602,16 +689,8 @@ class Aligner {
     });
     const sentenceRanges = [];
     const chapterSentenceCounts = {};
-    const audioFileDurations = {};
     for (const alignedChapter of audioOrderedChapters) {
       sentenceRanges.push(...alignedChapter.sentenceRanges);
-      for (const sentenceRange of sentenceRanges) {
-        if (!(sentenceRange.audiofile in audioFileDurations)) {
-          audioFileDurations[sentenceRange.audiofile] = await (0, import_ffmpeg.getTrackDuration)(
-            sentenceRange.audiofile
-          );
-        }
-      }
       const sentences = await this.getChapterSentences(
         alignedChapter.chapter.id
       );
@@ -620,7 +699,7 @@ class Aligner {
     const interpolated = (0, import_interpolateSentenceRanges.interpolateSentenceRanges)(
       sentenceRanges,
       chapterSentenceCounts,
-      audioFileDurations
+      this.audioFileDurations
     );
     const expanded = (0, import_getSentenceRanges.expandEmptySentenceRanges)(interpolated);
     const collapsed = await (0, import_getSentenceRanges.collapseSentenceRangeGaps)(expanded);
@@ -640,6 +719,11 @@ class Aligner {
       await this.writeAlignedChapter(alignedChapter);
       collapsedStart += sentences.length;
     }
+    for (const audiofile of this.audiofiles) {
+      if (!this.report.audioFiles.some(({ filepath }) => filepath === audiofile)) {
+        this.report.unalignedAudioFiles.push({ filepath: audiofile });
+      }
+    }
     await this.epub.addMetadata({
       type: "meta",
       properties: { property: "media:duration" },

package/dist/align/align.d.cts CHANGED Viewed

@@ -14,10 +14,15 @@ interface AudioFileContext {
 interface ChapterReport {
     href: string;
     transcriptionOffset: number;
+    endTranscriptionOffset: number;
     transcriptionContext: {
         before: string;
         after: string;
     };
+    endTranscriptionContext: {
+        before: string;
+        after: string;
+    };
     firstMatchedSentenceId: number;
     firstMatchedSentenceContext: {
         prevSentence: string | null;
@@ -34,8 +39,34 @@ interface ChapterReport {
     alignedSentenceCount: number;
     audioFiles: AudioFileContext[];
 }
+type UnalignedChapterReason = "too-short" | "not-found" | "is-nav" | "no-text";
+interface UnalignedChapterReport {
+    href: string;
+    reason: Exclude<UnalignedChapterReason, "not-found">;
+}
+interface UnalignedNotFoundChapterReport {
+    href: string;
+    reason: "not-found";
+    start: string;
+    end: string;
+}
+interface AudioFileReport {
+    filepath: string;
+    matchedRanges: {
+        start: number;
+        end: number;
+    }[];
+    duration: number;
+    alignedDuration: number;
+}
+interface UnalignedAudioFileReport {
+    filepath: string;
+}
 interface Report {
     chapters: ChapterReport[];
+    unalignedChapters: (UnalignedChapterReport | UnalignedNotFoundChapterReport)[];
+    audioFiles: AudioFileReport[];
+    unalignedAudioFiles: UnalignedAudioFileReport[];
 }
 interface AlignOptions {
     reportsPath?: string | null | undefined;
@@ -58,6 +89,7 @@ declare class Aligner {
     private timing;
     private granularity;
     private textRef;
+    private audioFileDurations;
     report: Report;
     constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, textRef: "id-fragment" | "text-fragment" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
     private getChapterSentences;

package/dist/align/align.d.ts CHANGED Viewed

@@ -14,10 +14,15 @@ interface AudioFileContext {
 interface ChapterReport {
     href: string;
     transcriptionOffset: number;
+    endTranscriptionOffset: number;
     transcriptionContext: {
         before: string;
         after: string;
     };
+    endTranscriptionContext: {
+        before: string;
+        after: string;
+    };
     firstMatchedSentenceId: number;
     firstMatchedSentenceContext: {
         prevSentence: string | null;
@@ -34,8 +39,34 @@ interface ChapterReport {
     alignedSentenceCount: number;
     audioFiles: AudioFileContext[];
 }
+type UnalignedChapterReason = "too-short" | "not-found" | "is-nav" | "no-text";
+interface UnalignedChapterReport {
+    href: string;
+    reason: Exclude<UnalignedChapterReason, "not-found">;
+}
+interface UnalignedNotFoundChapterReport {
+    href: string;
+    reason: "not-found";
+    start: string;
+    end: string;
+}
+interface AudioFileReport {
+    filepath: string;
+    matchedRanges: {
+        start: number;
+        end: number;
+    }[];
+    duration: number;
+    alignedDuration: number;
+}
+interface UnalignedAudioFileReport {
+    filepath: string;
+}
 interface Report {
     chapters: ChapterReport[];
+    unalignedChapters: (UnalignedChapterReport | UnalignedNotFoundChapterReport)[];
+    audioFiles: AudioFileReport[];
+    unalignedAudioFiles: UnalignedAudioFileReport[];
 }
 interface AlignOptions {
     reportsPath?: string | null | undefined;
@@ -58,6 +89,7 @@ declare class Aligner {
     private timing;
     private granularity;
     private textRef;
+    private audioFileDurations;
     report: Report;
     constructor(epub: Epub, audiofiles: string[], transcriptions: Pick<RecognitionResult, "transcript" | "timeline">[], granularity: "sentence" | "word" | null | undefined, textRef: "id-fragment" | "text-fragment" | null | undefined, languageOverride?: (Intl.Locale | null) | undefined, logger?: (Logger | null) | undefined);
     private getChapterSentences;

package/dist/align/align.js CHANGED Viewed

@@ -3,7 +3,7 @@ import {
   __using
 } from "../chunk-BIEQXUOY.js";
 import { randomUUID } from "node:crypto";
-import { createWriteStream } from "node:fs";
+import { createWriteStream, rmSync } from "node:fs";
 import {
   copyFile,
   cp,
@@ -51,14 +51,25 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
   var _stack2 = [];
   try {
     const outFormat = options.outFormat ?? "epub";
+    const epubPath = outFormat === "epub" ? autoJoin(
+      tmpdir(),
+      `storyteller-platform-align-${randomUUID()}`,
+      basename(output)
+    ) : input;
+    const stack = __using(_stack2, new DisposableStack());
+    stack.defer(() => {
+      if (outFormat === "epub") {
+        rmSync(dirname(epubPath), { recursive: true, force: true });
+      }
+    });
     if (outFormat === "epub") {
-      await mkdir(dirname(output), { recursive: true });
-      await copyFile(input, output);
+      await mkdir(dirname(epubPath), { recursive: true });
+      await copyFile(input, epubPath);
     }
     const audiobookFiles = await readdir(audiobookDir).then(
       (filenames) => filenames.filter((f) => isAudioFile(f)).map((f) => autoJoin(audiobookDir, f))
     );
-    const epub = __using(_stack2, await Epub.from(outFormat === "epub" ? output : input));
+    const epub = __using(_stack2, await Epub.from(epubPath));
     const transcriptions = await readdir(transcriptionsDir).then(
       (filenames) => filenames.filter((f) => f.endsWith(".json")).map((f) => autoJoin(transcriptionsDir, f))
     ).then(
@@ -92,6 +103,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
     const timing = await aligner.alignBook(options.onProgress);
     if (outFormat === "epub") {
       await epub.saveAndClose();
+      await mkdir(dirname(output), { recursive: true });
+      await copyFile(epubPath, output);
     } else {
       var _stack = [];
       try {
@@ -112,8 +125,8 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
         writeStream.on("close", () => {
           resolve();
         });
-        const stack = __using(_stack, new AsyncDisposableStack(), true);
-        stack.defer(async () => {
+        const stack2 = __using(_stack, new AsyncDisposableStack(), true);
+        stack2.defer(async () => {
           writeStream.close();
           await rm(tmpArchivePath, { force: true });
         });
@@ -175,8 +188,12 @@ class Aligner {
   timing = createAggregator();
   granularity;
   textRef;
+  audioFileDurations = {};
   report = {
-    chapters: []
+    chapters: [],
+    unalignedChapters: [],
+    audioFiles: [],
+    unalignedAudioFiles: []
   };
   async getChapterSentences(chapterId) {
     const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
@@ -247,18 +264,24 @@ class Aligner {
           );
         }
         if (this.granularity === "word") {
-          const wordFactory = new TextFragmentFactory(
-            blockRanges.flatMap((range) => {
-              const sentence = sentences[range.id];
-              const wordRanges2 = wordRangeMap.get(range.id);
-              const toFragment = wordIdToFragment.get(range.id);
-              if (!wordRanges2 || !toFragment) return [];
-              const words = sentence.words.entries.filter(
-                (w) => w.text.match(/\S/)
-              );
-              return words.map((w) => w.text.replace("\n", ""));
-            })
-          );
+          const allWords = [];
+          for (const range of blockRanges) {
+            const sentence = sentences[range.id];
+            const words = [];
+            for (const w of sentence.words.entries) {
+              if (w.isPunctuation) {
+                const lastWord = words.at(-1);
+                if (lastWord === void 0) {
+                  continue;
+                }
+                words[words.length - 1] = lastWord + w.text.replace("\n", "");
+              } else {
+                words.push(w.text);
+              }
+            }
+            allWords.push(...words);
+          }
+          const wordFactory = new TextFragmentFactory(allWords);
           let wordRangeIndex = 0;
           for (const range of blockRanges) {
             const wordRanges2 = wordRangeMap.get(range.id);
@@ -334,19 +357,53 @@ class Aligner {
       value: Epub.formatSmilDuration(chapterDuration)
     });
   }
-  addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, transcriptionOffset) {
+  addChapterReport(chapter, chapterSentences, sentenceRanges, startSentence, endSentence, mapping, transcriptionOffset, endTranscriptionOffset) {
+    const audioFiles = sentenceRanges.reduce(
+      (acc, range) => {
+        const existing = acc.find(
+          (context) => context.filepath === range.audiofile
+        );
+        if (existing) {
+          existing.end = range.end;
+          return acc;
+        }
+        acc.push({
+          filepath: range.audiofile,
+          start: range.start,
+          end: range.end
+        });
+        return acc;
+      },
+      []
+    );
+    const mappedTranscriptionOffset = mapping.invert().map(transcriptionOffset);
+    const mappedEndTranscriptionOffset = mapping.invert().map(endTranscriptionOffset);
     this.report.chapters.push({
       href: chapter.href,
-      transcriptionOffset,
+      transcriptionOffset: mappedTranscriptionOffset,
+      endTranscriptionOffset: mappedEndTranscriptionOffset,
       transcriptionContext: {
         before: this.transcription.transcript.slice(
-          Math.max(0, transcriptionOffset - 30),
-          transcriptionOffset
+          Math.max(0, mappedTranscriptionOffset - 80),
+          mappedTranscriptionOffset
         ),
         after: this.transcription.transcript.slice(
-          transcriptionOffset,
+          mappedTranscriptionOffset,
           Math.min(
-            transcriptionOffset + 30,
+            mappedTranscriptionOffset + 80,
+            this.transcription.transcript.length - 1
+          )
+        )
+      },
+      endTranscriptionContext: {
+        before: this.transcription.transcript.slice(
+          Math.max(0, mappedEndTranscriptionOffset - 80),
+          mappedEndTranscriptionOffset
+        ),
+        after: this.transcription.transcript.slice(
+          mappedEndTranscriptionOffset,
+          Math.min(
+            mappedEndTranscriptionOffset + 80,
             this.transcription.transcript.length - 1
           )
         )
@@ -367,24 +424,30 @@ class Aligner {
       },
       chapterSentenceCount: chapterSentences.length,
       alignedSentenceCount: sentenceRanges.length,
-      audioFiles: sentenceRanges.reduce((acc, range) => {
-        const existing = acc.find(
-          (context) => context.filepath === range.audiofile
-        );
-        if (existing) {
-          existing.end = range.end;
-          return acc;
-        }
-        acc.push({
-          filepath: range.audiofile,
-          start: range.start,
-          end: range.end
-        });
-        return acc;
-      }, [])
+      audioFiles
     });
+    for (const audioFile of audioFiles) {
+      const existing = this.report.audioFiles.find(
+        ({ filepath }) => audioFile.filepath === filepath
+      );
+      if (existing) {
+        existing.matchedRanges.push({
+          start: audioFile.start,
+          end: audioFile.end
+        });
+        existing.matchedRanges.sort((a, b) => a.start - b.start);
+        existing.alignedDuration += audioFile.end - audioFile.start;
+      } else {
+        this.report.audioFiles.push({
+          alignedDuration: audioFile.end - audioFile.start,
+          duration: this.audioFileDurations[audioFile.filepath] ?? 0,
+          filepath: audioFile.filepath,
+          matchedRanges: [{ start: audioFile.start, end: audioFile.end }]
+        });
+      }
+    }
   }
-  async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline) {
+  async alignChapter(chapterId, transcriptionText, transcriptionOffset, transcriptionEndOffset, locale, mappedTimeline, mapping) {
     const timing = createTiming();
     timing.start("read contents");
     const manifest = await this.epub.getManifest();
@@ -439,7 +502,9 @@ class Aligner {
       sentenceRanges,
       firstFoundSentence,
       lastFoundSentence,
-      transcriptionOffset
+      mapping,
+      transcriptionOffset,
+      endTranscriptionOffset
     );
     return {
       lastSentenceRange: sentenceRanges.at(-1) ?? null,
@@ -471,6 +536,9 @@ class Aligner {
     const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
     this.timing.setMetadata("language", locale.toString());
     this.timing.setMetadata("granularity", this.granularity);
+    for (const audiofile of this.audiofiles) {
+      this.audioFileDurations[audiofile] = await getTrackDuration(audiofile);
+    }
     const spine = await this.epub.getSpineItems();
     const manifest = await this.epub.getManifest();
     const { result: transcriptionText, mapping } = await slugify(
@@ -486,6 +554,10 @@ class Aligner {
       );
       const chapterId = spineItem.id;
       if (manifest[chapterId]?.properties?.includes("nav")) {
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "is-nav"
+        });
         continue;
       }
       const chapterSentences = await this.getChapterSentences(chapterId);
@@ -497,6 +569,10 @@ class Aligner {
       }
       if (chapterSentences.length === 0) {
         this.logger?.info(`Chapter #${index} has no text; skipping`);
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "no-text"
+        });
         continue;
       }
       if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
@@ -504,6 +580,10 @@ class Aligner {
         this.logger?.info(
           `Chapter #${index} is fewer than four words; skipping`
         );
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "too-short"
+        });
         continue;
       }
       const boundaries = findBoundaries(
@@ -514,6 +594,12 @@ class Aligner {
         this.logger?.info(
           `Could not find chapter #${index} in the transcripton`
         );
+        this.report.unalignedChapters.push({
+          href: spineItem.href,
+          reason: "not-found",
+          start: chapterSentences.slice(0, 3).map((s) => s.text).join("").slice(0, 80),
+          end: chapterSentences.slice(-3).map((s) => s.text).join("").slice(-80)
+        });
         continue;
       }
       const { start, end } = this.narrowToAvailableBoundary(boundaries);
@@ -526,7 +612,8 @@ class Aligner {
         Math.max(start, 0),
         Math.min(end, transcriptionText.length),
         locale,
-        mappedTimeline
+        mappedTimeline,
+        mapping
       );
       this.timing.add(result.timing.summary());
     }
@@ -548,16 +635,8 @@ class Aligner {
     });
     const sentenceRanges = [];
     const chapterSentenceCounts = {};
-    const audioFileDurations = {};
     for (const alignedChapter of audioOrderedChapters) {
       sentenceRanges.push(...alignedChapter.sentenceRanges);
-      for (const sentenceRange of sentenceRanges) {
-        if (!(sentenceRange.audiofile in audioFileDurations)) {
-          audioFileDurations[sentenceRange.audiofile] = await getTrackDuration(
-            sentenceRange.audiofile
-          );
-        }
-      }
       const sentences = await this.getChapterSentences(
         alignedChapter.chapter.id
       );
@@ -566,7 +645,7 @@ class Aligner {
     const interpolated = interpolateSentenceRanges(
       sentenceRanges,
       chapterSentenceCounts,
-      audioFileDurations
+      this.audioFileDurations
     );
     const expanded = expandEmptySentenceRanges(interpolated);
     const collapsed = await collapseSentenceRangeGaps(expanded);
@@ -586,6 +665,11 @@ class Aligner {
       await this.writeAlignedChapter(alignedChapter);
       collapsedStart += sentences.length;
     }
+    for (const audiofile of this.audiofiles) {
+      if (!this.report.audioFiles.some(({ filepath }) => filepath === audiofile)) {
+        this.report.unalignedAudioFiles.push({ filepath: audiofile });
+      }
+    }
     await this.epub.addMetadata({
       type: "meta",
       properties: { property: "media:duration" },

package/dist/align/parse.cjs CHANGED Viewed

@@ -40,7 +40,9 @@ const alignParser = (0, import_core.object)("Alignment", {
     }),
     "id-fragment"
   ),
-  reports: (0, import_core.optional)((0, import_core.option)("--reports", (0, import_valueparser.path)({ type: "directory" }))),
+  reports: (0, import_core.optional)(
+    (0, import_core.option)("--reports", (0, import_valueparser.path)({ type: "file", extensions: [".json"] }))
+  ),
   outFormat: (0, import_core.withDefault)(
     (0, import_core.option)("--out-format", (0, import_core.choice)(["epub", "gnp"]), {
       description: import_core.message`Whether to output a full EPUB 3 package with embedded media overlays and audio, or a Readium Guided Navigation Package with just a manifest and guided navigation documents.`

package/dist/align/parse.js CHANGED Viewed

@@ -31,7 +31,9 @@ const alignParser = object("Alignment", {
     }),
     "id-fragment"
   ),
-  reports: optional(option("--reports", path({ type: "directory" }))),
+  reports: optional(
+    option("--reports", path({ type: "file", extensions: [".json"] }))
+  ),
   outFormat: withDefault(
     option("--out-format", choice(["epub", "gnp"]), {
       description: message`Whether to output a full EPUB 3 package with embedded media overlays and audio, or a Readium Guided Navigation Package with just a manifest and guided navigation documents.`

package/dist/align/slugify.cjs CHANGED Viewed

@@ -133,6 +133,7 @@ async function slugify(text, locale) {
   replacerMap.set(locale, replacers);
   const { result, mapping } = await (0, import_transliteration.slugify)(text, {
     allowedChars: "a-zA-Z0-9",
+    locale,
     replace: replacers
   });
   return { result, mapping };

package/dist/align/slugify.js CHANGED Viewed

@@ -111,6 +111,7 @@ async function slugify(text, locale) {
   replacerMap.set(locale, replacers);
   const { result, mapping } = await transliterateSlugify(text, {
     allowedChars: "a-zA-Z0-9",
+    locale,
     replace: replacers
   });
   return { result, mapping };

package/dist/align/textFragments.cjs CHANGED Viewed

@@ -64,15 +64,23 @@ class TextFragmentFactory {
       toRemove.toReversed().map((r) => candidates.splice(r, 1));
       i++;
     }
+    while (chars.at(i)?.match(/[\p{L}\p{N}]/u) && i < chars.length) i++;
     let fragment = "";
     const start = chars.slice(0, i).join("");
     fragment += encodeTextFragmentPart(start);
-    const remainingSpan = span.slice(i);
+    const remainingChars = chars.slice(i);
+    while (!remainingChars.at(-1)?.match(/[\p{L}\p{N}]/u) && remainingChars.length) {
+      remainingChars.splice(remainingChars.length - 1, 1);
+    }
+    let e = remainingChars.length;
     let end = "";
-    let e = remainingSpan.length - 1;
-    if (remainingSpan.at(-1) === "\n") e--;
-    while (remainingSpan.indexOf(end) !== e + 1 && e >= 0) {
-      end = remainingSpan.slice(e);
+    const remainingSpan = remainingChars.join("");
+    while (remainingSpan.indexOf(end) !== remainingSpan.length - remainingChars.slice(e).join("").length && e >= 0) {
+      e--;
+      end = remainingChars.slice(e).join("");
+    }
+    while (remainingChars.at(e)?.match(/[\p{L}\p{N}]/u) && e >= 0) {
+      end = remainingChars.slice(e).join("");
       e--;
     }
     if (end) {
@@ -93,7 +101,10 @@ class TextFragmentFactory {
         p++;
         if (!candidates.length) break;
       }
-      const prefix = this.runes.slice(startPos - p + 1, startPos).join("");
+      while (this.runes.at(startPos - p - 1)?.match(/[\p{L}\p{N}]/u) && p <= startPos) {
+        p++;
+      }
+      const prefix = this.runes.slice(startPos - p, startPos).join("");
       fragment = `${encodeTextFragmentPart(prefix)}-,${fragment}`;
     }
     return `:~:text=${fragment}`;

package/dist/align/textFragments.js CHANGED Viewed

@@ -42,15 +42,23 @@ class TextFragmentFactory {
       toRemove.toReversed().map((r) => candidates.splice(r, 1));
       i++;
     }
+    while (chars.at(i)?.match(/[\p{L}\p{N}]/u) && i < chars.length) i++;
     let fragment = "";
     const start = chars.slice(0, i).join("");
     fragment += encodeTextFragmentPart(start);
-    const remainingSpan = span.slice(i);
+    const remainingChars = chars.slice(i);
+    while (!remainingChars.at(-1)?.match(/[\p{L}\p{N}]/u) && remainingChars.length) {
+      remainingChars.splice(remainingChars.length - 1, 1);
+    }
+    let e = remainingChars.length;
     let end = "";
-    let e = remainingSpan.length - 1;
-    if (remainingSpan.at(-1) === "\n") e--;
-    while (remainingSpan.indexOf(end) !== e + 1 && e >= 0) {
-      end = remainingSpan.slice(e);
+    const remainingSpan = remainingChars.join("");
+    while (remainingSpan.indexOf(end) !== remainingSpan.length - remainingChars.slice(e).join("").length && e >= 0) {
+      e--;
+      end = remainingChars.slice(e).join("");
+    }
+    while (remainingChars.at(e)?.match(/[\p{L}\p{N}]/u) && e >= 0) {
+      end = remainingChars.slice(e).join("");
       e--;
     }
     if (end) {
@@ -71,7 +79,10 @@ class TextFragmentFactory {
         p++;
         if (!candidates.length) break;
       }
-      const prefix = this.runes.slice(startPos - p + 1, startPos).join("");
+      while (this.runes.at(startPos - p - 1)?.match(/[\p{L}\p{N}]/u) && p <= startPos) {
+        p++;
+      }
+      const prefix = this.runes.slice(startPos - p, startPos).join("");
       fragment = `${encodeTextFragmentPart(prefix)}-,${fragment}`;
     }
     return `:~:text=${fragment}`;

package/dist/cli/bin.cjs CHANGED Viewed

@@ -229,6 +229,7 @@ async function main() {
             textRef: parsed.textRef,
             outFormat: parsed.outFormat,
             primaryLocale: parsed.language,
+            reportsPath: parsed.reports,
             logger,
             ...!parsed.noProgress && parsed.logLevel === "silent" && {
               onProgress: (progress) => {
@@ -349,6 +350,7 @@ async function main() {
               textRef: parsed.textRef,
               outFormat: parsed.outFormat,
               primaryLocale,
+              reportsPath: parsed.reports,
               logger,
               ...!parsed.noProgress && parsed.logLevel === "silent" && {
                 onProgress: (progress) => {

package/dist/cli/bin.js CHANGED Viewed

@@ -180,6 +180,7 @@ async function main() {
             textRef: parsed.textRef,
             outFormat: parsed.outFormat,
             primaryLocale: parsed.language,
+            reportsPath: parsed.reports,
             logger,
             ...!parsed.noProgress && parsed.logLevel === "silent" && {
               onProgress: (progress) => {
@@ -300,6 +301,7 @@ async function main() {
               textRef: parsed.textRef,
               outFormat: parsed.outFormat,
               primaryLocale,
+              reportsPath: parsed.reports,
               logger,
               ...!parsed.noProgress && parsed.logLevel === "silent" && {
                 onProgress: (progress) => {

package/dist/markup/model.d.cts CHANGED Viewed

@@ -1,2 +1,2 @@
 import '@storyteller-platform/epub';
-export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-TZi1QUQh.cjs';
+export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-Bv3yPEdd.cjs';

package/dist/markup/model.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
 import '@storyteller-platform/epub';
-export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-TZi1QUQh.js';
+export { F as FootnoteNode, M as Mark, N as Node, b as NoterefNode, R as Root, T as TextNode, d as descendants } from '../model-Bv3yPEdd.js';

package/dist/markup/parseDom.d.cts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { ParsedXml } from '@storyteller-platform/epub';
-import { R as Root, N as Node } from '../model-TZi1QUQh.cjs';
+import { R as Root, N as Node } from '../model-Bv3yPEdd.cjs';
 declare function parseDom(xml: ParsedXml): Root;
 declare function findFootnotePairs(root: Root | Node): Map<number, number>;

package/dist/markup/parseDom.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { ParsedXml } from '@storyteller-platform/epub';
-import { R as Root, N as Node } from '../model-TZi1QUQh.js';
+import { R as Root, N as Node } from '../model-Bv3yPEdd.js';
 declare function parseDom(xml: ParsedXml): Root;
 declare function findFootnotePairs(root: Root | Node): Map<number, number>;

package/dist/markup/resolvedPos.d.cts CHANGED Viewed

@@ -1,2 +1,2 @@
-export { a as ResolvedPos } from '../model-TZi1QUQh.cjs';
+export { a as ResolvedPos } from '../model-Bv3yPEdd.cjs';
 import '@storyteller-platform/epub';

package/dist/markup/resolvedPos.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export { a as ResolvedPos } from '../model-TZi1QUQh.js';
+export { a as ResolvedPos } from '../model-Bv3yPEdd.js';
 import '@storyteller-platform/epub';

package/dist/markup/serializeDom.d.cts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { ParsedXml } from '@storyteller-platform/epub';
-import { R as Root } from '../model-TZi1QUQh.cjs';
+import { R as Root } from '../model-Bv3yPEdd.cjs';
 declare function serializeDom(doc: Root): ParsedXml;

package/dist/markup/serializeDom.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { ParsedXml } from '@storyteller-platform/epub';
-import { R as Root } from '../model-TZi1QUQh.js';
+import { R as Root } from '../model-Bv3yPEdd.js';
 declare function serializeDom(doc: Root): ParsedXml;

package/dist/markup/transform.cjs CHANGED Viewed

@@ -81,11 +81,22 @@ function liftText(root) {
         ])
       );
     }
-    lastTextEnd = pos + node.nodeSize;
     let result = node.text.replaceAll(/\n/g, " ");
+    if (text.endsWith("\n")) {
+      const contentStart = result.match(/\S/u)?.index ?? result.length;
+      if (contentStart !== 0) {
+        result = result.slice(contentStart);
+        mapping.appendMap(
+          new import_map.StepMap([mapping.map(lastTextEnd), contentStart, 0])
+        );
+      }
+    }
+    lastTextEnd = pos + node.nodeSize;
     const hasBlockSiblings = parent.children.some((child) => child.isBlock);
     if (hasBlockSiblings && !result.match(/\S/)) {
-      mapping.appendMap(new import_map.StepMap([textLength, result.length, 0]));
+      if (result.length) {
+        mapping.appendMap(new import_map.StepMap([textLength, result.length, 0]));
+      }
       result = "";
     }
     if (parent.isBlock && index === parent.children.length - 1 && !(text + result).endsWith("\n")) {

package/dist/markup/transform.d.cts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { Mapping } from './map.cjs';
-import { R as Root, M as Mark } from '../model-TZi1QUQh.cjs';
+import { R as Root, M as Mark } from '../model-Bv3yPEdd.cjs';
 import '@storyteller-platform/epub';
 declare function addMark(root: Root, from: number, to: number, mark: Mark): Root;

package/dist/markup/transform.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { Mapping } from './map.js';
-import { R as Root, M as Mark } from '../model-TZi1QUQh.js';
+import { R as Root, M as Mark } from '../model-Bv3yPEdd.js';
 import '@storyteller-platform/epub';
 declare function addMark(root: Root, from: number, to: number, mark: Mark): Root;

package/dist/markup/transform.js CHANGED Viewed

@@ -61,11 +61,22 @@ function liftText(root) {
         ])
       );
     }
-    lastTextEnd = pos + node.nodeSize;
     let result = node.text.replaceAll(/\n/g, " ");
+    if (text.endsWith("\n")) {
+      const contentStart = result.match(/\S/u)?.index ?? result.length;
+      if (contentStart !== 0) {
+        result = result.slice(contentStart);
+        mapping.appendMap(
+          new StepMap([mapping.map(lastTextEnd), contentStart, 0])
+        );
+      }
+    }
+    lastTextEnd = pos + node.nodeSize;
     const hasBlockSiblings = parent.children.some((child) => child.isBlock);
     if (hasBlockSiblings && !result.match(/\S/)) {
-      mapping.appendMap(new StepMap([textLength, result.length, 0]));
+      if (result.length) {
+        mapping.appendMap(new StepMap([textLength, result.length, 0]));
+      }
       result = "";
     }
     if (parent.isBlock && index === parent.children.length - 1 && !(text + result).endsWith("\n")) {

package/dist/{model-TZi1QUQh.d.cts → model-Bv3yPEdd.d.cts} RENAMED Viewed

@@ -45,7 +45,7 @@ declare class Node {
     get isLeaf(): boolean;
     get isInline(): boolean;
     get isBlock(): boolean;
-    get border(): 1 | 0;
+    get border(): 0 | 1;
     get nodeSize(): number;
     get contentSize(): number;
     get textContent(): string;

package/dist/{model-TZi1QUQh.d.ts → model-Bv3yPEdd.d.ts} RENAMED Viewed

@@ -45,7 +45,7 @@ declare class Node {
     get isLeaf(): boolean;
     get isInline(): boolean;
     get isBlock(): boolean;
-    get border(): 1 | 0;
+    get border(): 0 | 1;
     get nodeSize(): number;
     get contentSize(): number;
     get textContent(): string;

package/dist/snapshot/snapshot.cjs CHANGED Viewed

@@ -71,6 +71,7 @@ var import_promises = require("node:fs/promises");
 var import_node_path = require("node:path");
 var import_posix = require("node:path/posix");
 var import_epub = require("@storyteller-platform/epub");
+var import_ghost_story = require("@storyteller-platform/ghost-story");
 var import_parseDom = require("../markup/parseDom.cjs");
 var import_segmentation = require("../markup/segmentation.cjs");
 var import_transform = require("../markup/transform.cjs");
@@ -184,7 +185,9 @@ async function createAlignmentSnapshot(epub, transcriptionFilepaths, textRef) {
         }
         word = transcription.timeline[++i];
       }
-      const transcriptionSentence = transcriptionWords.join(" ");
+      const transcriptionSentence = transcriptionWords.map(
+        (w, idx) => (0, import_ghost_story.startsWithSpacelessScript)(w) || idx === transcriptionWords.length - 1 ? w : `${w} `
+      ).join("");
       newSnapshot += `Audio: ${transcriptionSentence}
 `;
     }

package/dist/snapshot/snapshot.js CHANGED Viewed

@@ -11,6 +11,9 @@ import {
 import {
   Epub
 } from "@storyteller-platform/epub";
+import {
+  startsWithSpacelessScript
+} from "@storyteller-platform/ghost-story";
 import { parseDom } from "../markup/parseDom.js";
 import { segmentChapter } from "../markup/segmentation.js";
 import { inlineFootnotes, liftText } from "../markup/transform.js";
@@ -124,7 +127,9 @@ async function createAlignmentSnapshot(epub, transcriptionFilepaths, textRef) {
         }
         word = transcription.timeline[++i];
       }
-      const transcriptionSentence = transcriptionWords.join(" ");
+      const transcriptionSentence = transcriptionWords.map(
+        (w, idx) => startsWithSpacelessScript(w) || idx === transcriptionWords.length - 1 ? w : `${w} `
+      ).join("");
       newSnapshot += `Audio: ${transcriptionSentence}
 `;
     }

package/dist/transcribe/transcribe.cjs CHANGED Viewed

@@ -84,10 +84,6 @@ var import_async_semaphore = require("@esfx/async-semaphore");
 var import_audiobook = require("@storyteller-platform/audiobook");
 var import_ghost_story = require("@storyteller-platform/ghost-story");
 async function transcribe(input, output, locale, options) {
-  if (process.env["DEBUG_TRANSCRIBE"] === "true") {
-    const inspector = await import("node:inspector");
-    inspector.open(9231, "0.0.0.0", true);
-  }
   const semaphore = new import_async_semaphore.AsyncSemaphore(options.parallelism ?? 1);
   const controller = new AbortController();
   const signal = AbortSignal.any([

package/dist/transcribe/transcribe.js CHANGED Viewed

@@ -15,10 +15,6 @@ import {
   recognize
 } from "@storyteller-platform/ghost-story";
 async function transcribe(input, output, locale, options) {
-  if (process.env["DEBUG_TRANSCRIBE"] === "true") {
-    const inspector = await import("node:inspector");
-    inspector.open(9231, "0.0.0.0", true);
-  }
   const semaphore = new AsyncSemaphore(options.parallelism ?? 1);
   const controller = new AbortController();
   const signal = AbortSignal.any([

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@storyteller-platform/align",
-  "version": "0.1.36",
+  "version": "0.1.37",
   "description": "A library and CLI for automatically aligning audiobooks and EPUBs to produce Media Overlays",
   "author": "Shane Friedman",
   "license": "MIT",
@@ -62,7 +62,7 @@
     "@readium/shared": "patch:@readium/shared@npm%3A2.1.5#~/.yarn/patches/@readium-shared-npm-2.1.5-8d6f9d2432.patch",
     "@storyteller-platform/audiobook": "^0.3.10",
     "@storyteller-platform/epub": "^0.5.0",
-    "@storyteller-platform/ghost-story": "^0.1.10",
+    "@storyteller-platform/ghost-story": "^0.1.11",
     "@storyteller-platform/transliteration": "^3.1.2",
     "chalk": "^5.4.1",
     "change-case": "^5.4.4",