npm - @storyteller-platform/align - Versions diffs - 0.1.6 → 0.1.7 - Mend

@storyteller-platform/align 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/dist/align/__tests__/align.test.cjs +282 -0
package/dist/align/__tests__/align.test.d.cts +2 -0
package/dist/align/__tests__/align.test.d.ts +2 -0
package/dist/align/__tests__/align.test.js +218 -0
package/dist/align/__tests__/slugify.test.cjs +64 -0
package/dist/align/__tests__/slugify.test.d.cts +2 -0
package/dist/align/__tests__/slugify.test.d.ts +2 -0
package/dist/align/__tests__/slugify.test.js +41 -0
package/dist/align/align.cjs +41 -21
package/dist/align/align.js +41 -21
package/dist/align/fuzzy.cjs +1 -1
package/dist/align/fuzzy.js +1 -1
package/dist/align/getSentenceRanges.cjs +24 -12
package/dist/align/getSentenceRanges.d.cts +1 -1
package/dist/align/getSentenceRanges.d.ts +1 -1
package/dist/align/getSentenceRanges.js +24 -12
package/dist/align/slugify.cjs +125 -0
package/dist/align/slugify.d.cts +8 -0
package/dist/align/slugify.d.ts +8 -0
package/dist/align/slugify.js +102 -0
package/package.json +6 -3

package/dist/align/align.cjs CHANGED Viewed

@@ -89,10 +89,12 @@ var import_ffmpeg = require("../common/ffmpeg.cjs");
 var import_segmentation = require("../markup/segmentation.cjs");
 var import_fuzzy = require("./fuzzy.cjs");
 var import_getSentenceRanges = require("./getSentenceRanges.cjs");
+var import_slugify = require("./slugify.cjs");
 const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
 async function align(input, output, transcriptionsDir, audiobookDir, options) {
   var _stack = [];
   try {
+    await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
     await (0, import_promises.copyFile)(input, output);
     const audiobookFiles = await (0, import_promises.readdir)(audiobookDir).then(
       (filenames) => filenames.filter((f) => (0, import_audiobook.isAudioFile)(f)).map((f) => (0, import_node_path.join)(audiobookDir, f))
@@ -118,6 +120,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
       options.logger
     );
     const timing = await aligner.alignBook(options.onProgress);
+    await epub.saveAndClose();
     if (options.reportsPath) {
       await (0, import_promises.mkdir)((0, import_node_path.dirname)(options.reportsPath), { recursive: true });
       await (0, import_promises.writeFile)(
@@ -152,7 +155,7 @@ class Aligner {
   report = {
     chapters: []
   };
-  findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
+  findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
     let i = 0;
     while (i < transcriptionText.length) {
       let startSentence = 0;
@@ -164,11 +167,13 @@ class Aligner {
       let startSeen = null;
       let endSeen = null;
       for (const aligned of this.alignedChapters) {
-        if (startSeen !== null && endSeen === aligned.startOffset) {
-          endSeen = aligned.endOffset;
+        const alignedStart = mapping.map(aligned.startOffset, -1);
+        const alignedEnd = mapping.map(aligned.endOffset, -1);
+        if (startSeen !== null && endSeen === alignedStart) {
+          endSeen = alignedEnd;
         } else {
-          startSeen = aligned.startOffset;
-          endSeen = aligned.endOffset;
+          startSeen = alignedStart;
+          endSeen = alignedEnd;
         }
         if (startIndex >= startSeen && startIndex < endSeen) {
           startIndex = endSeen;
@@ -183,7 +188,7 @@ class Aligner {
           endIndex
         );
         while (startSentence < epubSentences.length) {
-          const queryString = epubSentences.slice(startSentence, startSentence + 6).join(" ");
+          const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
           const firstMatch = (0, import_fuzzy.findNearestMatch)(
             queryString.toLowerCase(),
             transcriptionTextSlice.toLowerCase(),
@@ -309,7 +314,7 @@ class Aligner {
       }, [])
     });
   }
-  async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
+  async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
     const timing = (0, import_ghost_story.createTiming)();
     timing.start("read contents");
     const manifest = await this.epub.getManifest();
@@ -329,6 +334,7 @@ class Aligner {
       this.transcription,
       chapterSentences,
       transcriptionOffset,
+      locale,
       lastSentenceRange
     );
     timing.end("align sentences");
@@ -369,53 +375,67 @@ class Aligner {
     };
   }
   async alignBook(onProgress) {
-    var _a, _b, _c, _d, _e, _f;
-    this.timing.setMetadata(
-      "language",
-      ((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
-    );
+    var _a, _b, _c, _d, _e, _f, _g;
+    const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
+    this.timing.setMetadata("language", locale.toString());
     this.timing.setMetadata("granularity", this.granularity);
     const spine = await this.epub.getSpineItems();
-    const transcriptionText = this.transcription.transcript;
+    const manifest = await this.epub.getManifest();
+    const { result: transcriptionText, mapping } = await (0, import_slugify.slugify)(
+      this.transcription.transcript,
+      locale
+    );
     let lastTranscriptionOffset = 0;
     let lastSentenceRange = null;
     for (let index = 0; index < spine.length; index++) {
       onProgress == null ? void 0 : onProgress(index / spine.length);
       const spineItem = spine[index];
-      (_b = this.logger) == null ? void 0 : _b.info(
+      (_a = this.logger) == null ? void 0 : _a.info(
         `Aligning epub item #${index} : ${(0, import_posix.basename)(spineItem.href)}`
       );
       const chapterId = spineItem.id;
+      if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
+        continue;
+      }
       const chapterSentences = await this.getChapterSentences(chapterId);
+      const slugifiedChapterSentences = [];
+      for (const chapterSentence of chapterSentences) {
+        slugifiedChapterSentences.push(
+          (await (0, import_slugify.slugify)(chapterSentence, locale)).result
+        );
+      }
       if (chapterSentences.length === 0) {
-        (_c = this.logger) == null ? void 0 : _c.info(`Chapter #${index} has no text; skipping`);
+        (_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
         continue;
       }
       if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
       chapterSentences[0].split(" ").length < 4) {
-        (_d = this.logger) == null ? void 0 : _d.info(
+        (_e = this.logger) == null ? void 0 : _e.info(
           `Chapter #${index} is fewer than four words; skipping`
         );
         continue;
       }
-      const { startSentence, transcriptionOffset } = this.findBestOffset(
-        chapterSentences,
+      const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
+        slugifiedChapterSentences,
         transcriptionText,
-        lastTranscriptionOffset
+        mapping.map(lastTranscriptionOffset, -1),
+        mapping
       );
+      const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
       if (transcriptionOffset === null) {
-        (_e = this.logger) == null ? void 0 : _e.info(
+        (_f = this.logger) == null ? void 0 : _f.info(
           `Couldn't find matching transcription for chapter #${index}`
         );
         continue;
       }
-      (_f = this.logger) == null ? void 0 : _f.info(
+      (_g = this.logger) == null ? void 0 : _g.info(
         `Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
       );
       const result = await this.alignChapter(
         startSentence,
         chapterId,
         transcriptionOffset,
+        locale,
         lastSentenceRange
       );
       lastSentenceRange = result.lastSentenceRange;

package/dist/align/align.js CHANGED Viewed

@@ -23,10 +23,12 @@ import {
   getSentenceRanges,
   interpolateSentenceRanges
 } from "./getSentenceRanges.js";
+import { slugify } from "./slugify.js";
 const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
 async function align(input, output, transcriptionsDir, audiobookDir, options) {
   var _stack = [];
   try {
+    await mkdir(dirname(output), { recursive: true });
     await copyFile(input, output);
     const audiobookFiles = await readdir(audiobookDir).then(
       (filenames) => filenames.filter((f) => isAudioFile(f)).map((f) => autoJoin(audiobookDir, f))
@@ -52,6 +54,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
       options.logger
     );
     const timing = await aligner.alignBook(options.onProgress);
+    await epub.saveAndClose();
     if (options.reportsPath) {
       await mkdir(autoDirname(options.reportsPath), { recursive: true });
       await writeFile(
@@ -86,7 +89,7 @@ class Aligner {
   report = {
     chapters: []
   };
-  findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
+  findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
     let i = 0;
     while (i < transcriptionText.length) {
       let startSentence = 0;
@@ -98,11 +101,13 @@ class Aligner {
       let startSeen = null;
       let endSeen = null;
       for (const aligned of this.alignedChapters) {
-        if (startSeen !== null && endSeen === aligned.startOffset) {
-          endSeen = aligned.endOffset;
+        const alignedStart = mapping.map(aligned.startOffset, -1);
+        const alignedEnd = mapping.map(aligned.endOffset, -1);
+        if (startSeen !== null && endSeen === alignedStart) {
+          endSeen = alignedEnd;
         } else {
-          startSeen = aligned.startOffset;
-          endSeen = aligned.endOffset;
+          startSeen = alignedStart;
+          endSeen = alignedEnd;
         }
         if (startIndex >= startSeen && startIndex < endSeen) {
           startIndex = endSeen;
@@ -117,7 +122,7 @@ class Aligner {
           endIndex
         );
         while (startSentence < epubSentences.length) {
-          const queryString = epubSentences.slice(startSentence, startSentence + 6).join(" ");
+          const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
           const firstMatch = findNearestMatch(
             queryString.toLowerCase(),
             transcriptionTextSlice.toLowerCase(),
@@ -243,7 +248,7 @@ class Aligner {
       }, [])
     });
   }
-  async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
+  async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
     const timing = createTiming();
     timing.start("read contents");
     const manifest = await this.epub.getManifest();
@@ -263,6 +268,7 @@ class Aligner {
       this.transcription,
       chapterSentences,
       transcriptionOffset,
+      locale,
       lastSentenceRange
     );
     timing.end("align sentences");
@@ -303,53 +309,67 @@ class Aligner {
     };
   }
   async alignBook(onProgress) {
-    var _a, _b, _c, _d, _e, _f;
-    this.timing.setMetadata(
-      "language",
-      ((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
-    );
+    var _a, _b, _c, _d, _e, _f, _g;
+    const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
+    this.timing.setMetadata("language", locale.toString());
     this.timing.setMetadata("granularity", this.granularity);
     const spine = await this.epub.getSpineItems();
-    const transcriptionText = this.transcription.transcript;
+    const manifest = await this.epub.getManifest();
+    const { result: transcriptionText, mapping } = await slugify(
+      this.transcription.transcript,
+      locale
+    );
     let lastTranscriptionOffset = 0;
     let lastSentenceRange = null;
     for (let index = 0; index < spine.length; index++) {
       onProgress == null ? void 0 : onProgress(index / spine.length);
       const spineItem = spine[index];
-      (_b = this.logger) == null ? void 0 : _b.info(
+      (_a = this.logger) == null ? void 0 : _a.info(
         `Aligning epub item #${index} : ${basename(spineItem.href)}`
       );
       const chapterId = spineItem.id;
+      if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
+        continue;
+      }
       const chapterSentences = await this.getChapterSentences(chapterId);
+      const slugifiedChapterSentences = [];
+      for (const chapterSentence of chapterSentences) {
+        slugifiedChapterSentences.push(
+          (await slugify(chapterSentence, locale)).result
+        );
+      }
       if (chapterSentences.length === 0) {
-        (_c = this.logger) == null ? void 0 : _c.info(`Chapter #${index} has no text; skipping`);
+        (_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
         continue;
       }
       if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
       chapterSentences[0].split(" ").length < 4) {
-        (_d = this.logger) == null ? void 0 : _d.info(
+        (_e = this.logger) == null ? void 0 : _e.info(
           `Chapter #${index} is fewer than four words; skipping`
         );
         continue;
       }
-      const { startSentence, transcriptionOffset } = this.findBestOffset(
-        chapterSentences,
+      const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
+        slugifiedChapterSentences,
         transcriptionText,
-        lastTranscriptionOffset
+        mapping.map(lastTranscriptionOffset, -1),
+        mapping
       );
+      const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
       if (transcriptionOffset === null) {
-        (_e = this.logger) == null ? void 0 : _e.info(
+        (_f = this.logger) == null ? void 0 : _f.info(
           `Couldn't find matching transcription for chapter #${index}`
         );
         continue;
       }
-      (_f = this.logger) == null ? void 0 : _f.info(
+      (_g = this.logger) == null ? void 0 : _g.info(
         `Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
       );
       const result = await this.alignChapter(
         startSentence,
         chapterId,
         transcriptionOffset,
+        locale,
         lastSentenceRange
       );
       lastSentenceRange = result.lastSentenceRange;

package/dist/align/fuzzy.cjs CHANGED Viewed

@@ -108,7 +108,7 @@ function expand(subsequence, sequence, maxDist) {
 function* levenshteinNgram(subsequence, sequence, maxDist) {
   const subsequenceLength = subsequence.length;
   const sequenceLength = sequence.length;
-  const ngramLength = Math.round(subsequenceLength / (maxDist + 1));
+  const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
   if (ngramLength === 0) {
     throw new Error("The subsequence length must be greater than maxDist");
   }

package/dist/align/fuzzy.js CHANGED Viewed

@@ -86,7 +86,7 @@ function expand(subsequence, sequence, maxDist) {
 function* levenshteinNgram(subsequence, sequence, maxDist) {
   const subsequenceLength = subsequence.length;
   const sequenceLength = sequence.length;
-  const ngramLength = Math.round(subsequenceLength / (maxDist + 1));
+  const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
   if (ngramLength === 0) {
     throw new Error("The subsequence length must be greater than maxDist");
   }

package/dist/align/getSentenceRanges.cjs CHANGED Viewed

@@ -28,6 +28,7 @@ module.exports = __toCommonJS(getSentenceRanges_exports);
 var import_text_segmentation = require("@echogarden/text-segmentation");
 var import_ffmpeg = require("../common/ffmpeg.cjs");
 var import_fuzzy = require("./fuzzy.cjs");
+var import_slugify = require("./slugify.cjs");
 async function getSentencesWithOffsets(text) {
   const sentences = await (0, import_text_segmentation.segmentText)(text).then(
     (r) => r.sentences.map((s) => s.text)
@@ -75,7 +76,7 @@ function getWindowIndexFromOffset(window, offset) {
 function collapseWhitespace(input) {
   return input.replaceAll(/\s+/g, " ");
 }
-async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
+async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
   const sentenceRanges = [];
   const fullTranscriptionText = transcription.transcript;
   const transcriptionText = fullTranscriptionText.slice(chapterOffset);
@@ -83,13 +84,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
     transcriptionText
   ).then((s) => s.map((sentence) => sentence.toLowerCase()));
   let startSentenceEntry = startSentence;
-  const sentenceEntries = sentences.map((sentence, index) => [index, sentence]).filter(([index, sentence]) => {
-    if (sentence.replaceAll(/[.-_()[\],/?!@#$%^^&*`~;:='"<>+ˌˈ]/g, "").length <= 3) {
-      if (index < startSentence) startSentenceEntry--;
-      return false;
+  const sentenceEntries = [];
+  for (let i = 0; i < sentences.length; i++) {
+    const sentence = (await (0, import_slugify.slugify)(sentences[i], locale)).result;
+    if (sentence.length <= 3) {
+      if (i < startSentence) startSentenceEntry--;
+      continue;
     }
-    return true;
-  });
+    sentenceEntries.push([i, sentence]);
+  }
   let transcriptionWindowIndex = 0;
   let transcriptionWindowOffset = 0;
   let lastGoodTranscriptionWindow = 0;
@@ -102,7 +105,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       transcriptionWindowIndex,
       transcriptionWindowIndex + 10
     );
-    const transcriptionWindow = transcriptionWindowList.join("").slice(transcriptionWindowOffset);
+    const { result: transcriptionWindow, mapping } = await (0, import_slugify.slugify)(
+      transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
+      locale
+    );
+    const inverted = mapping.invert();
     const query = collapseWhitespace(sentence.trim()).toLowerCase();
     const firstMatch = (0, import_fuzzy.findNearestMatch)(
       query,
@@ -125,8 +132,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       continue;
     }
     const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
+    const matchStart = inverted.map(firstMatch.index, 1);
+    const matchEnd = inverted.map(
+      firstMatch.index + firstMatch.match.length,
+      -1
+    );
     const startResult = findStartTimestamp(
-      firstMatch.index + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
+      matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
       transcription
     );
     if (!startResult) {
@@ -136,7 +148,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
     let start = startResult.start;
     const audiofile = startResult.audiofile;
     const end = findEndTimestamp(
-      firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
+      matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
       transcription
     ) ?? startResult.end;
     if (sentenceRanges.length > 0) {
@@ -177,10 +189,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       audiofile
     });
     notFound = 0;
-    lastMatchEnd = firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
+    lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
     const windowIndexResult = getWindowIndexFromOffset(
       transcriptionWindowList,
-      firstMatch.index + firstMatch.match.length + transcriptionWindowOffset
+      matchEnd + transcriptionWindowOffset
     );
     transcriptionWindowIndex += windowIndexResult.index;
     transcriptionWindowOffset = windowIndexResult.offset;

package/dist/align/getSentenceRanges.d.cts CHANGED Viewed

@@ -14,7 +14,7 @@ type SentenceRange = {
     audiofile: string;
 };
 declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
-declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
+declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
     sentenceRanges: SentenceRange[];
     transcriptionOffset: number;
 }>;

package/dist/align/getSentenceRanges.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ type SentenceRange = {
     audiofile: string;
 };
 declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
-declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
+declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
     sentenceRanges: SentenceRange[];
     transcriptionOffset: number;
 }>;

package/dist/align/getSentenceRanges.js CHANGED Viewed

@@ -2,6 +2,7 @@ import "../chunk-BIEQXUOY.js";
 import { segmentText } from "@echogarden/text-segmentation";
 import { getTrackDuration } from "../common/ffmpeg.js";
 import { findNearestMatch } from "./fuzzy.js";
+import { slugify } from "./slugify.js";
 async function getSentencesWithOffsets(text) {
   const sentences = await segmentText(text).then(
     (r) => r.sentences.map((s) => s.text)
@@ -49,7 +50,7 @@ function getWindowIndexFromOffset(window, offset) {
 function collapseWhitespace(input) {
   return input.replaceAll(/\s+/g, " ");
 }
-async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
+async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
   const sentenceRanges = [];
   const fullTranscriptionText = transcription.transcript;
   const transcriptionText = fullTranscriptionText.slice(chapterOffset);
@@ -57,13 +58,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
     transcriptionText
   ).then((s) => s.map((sentence) => sentence.toLowerCase()));
   let startSentenceEntry = startSentence;
-  const sentenceEntries = sentences.map((sentence, index) => [index, sentence]).filter(([index, sentence]) => {
-    if (sentence.replaceAll(/[.-_()[\],/?!@#$%^^&*`~;:='"<>+ˌˈ]/g, "").length <= 3) {
-      if (index < startSentence) startSentenceEntry--;
-      return false;
+  const sentenceEntries = [];
+  for (let i = 0; i < sentences.length; i++) {
+    const sentence = (await slugify(sentences[i], locale)).result;
+    if (sentence.length <= 3) {
+      if (i < startSentence) startSentenceEntry--;
+      continue;
     }
-    return true;
-  });
+    sentenceEntries.push([i, sentence]);
+  }
   let transcriptionWindowIndex = 0;
   let transcriptionWindowOffset = 0;
   let lastGoodTranscriptionWindow = 0;
@@ -76,7 +79,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       transcriptionWindowIndex,
       transcriptionWindowIndex + 10
     );
-    const transcriptionWindow = transcriptionWindowList.join("").slice(transcriptionWindowOffset);
+    const { result: transcriptionWindow, mapping } = await slugify(
+      transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
+      locale
+    );
+    const inverted = mapping.invert();
     const query = collapseWhitespace(sentence.trim()).toLowerCase();
     const firstMatch = findNearestMatch(
       query,
@@ -99,8 +106,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       continue;
     }
     const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
+    const matchStart = inverted.map(firstMatch.index, 1);
+    const matchEnd = inverted.map(
+      firstMatch.index + firstMatch.match.length,
+      -1
+    );
     const startResult = findStartTimestamp(
-      firstMatch.index + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
+      matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
       transcription
     );
     if (!startResult) {
@@ -110,7 +122,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
     let start = startResult.start;
     const audiofile = startResult.audiofile;
     const end = findEndTimestamp(
-      firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
+      matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
       transcription
     ) ?? startResult.end;
     if (sentenceRanges.length > 0) {
@@ -151,10 +163,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
       audiofile
     });
     notFound = 0;
-    lastMatchEnd = firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
+    lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
     const windowIndexResult = getWindowIndexFromOffset(
       transcriptionWindowList,
-      firstMatch.index + firstMatch.match.length + transcriptionWindowOffset
+      matchEnd + transcriptionWindowOffset
     );
     transcriptionWindowIndex += windowIndexResult.index;
     transcriptionWindowOffset = windowIndexResult.offset;

package/dist/align/slugify.cjs ADDED Viewed

@@ -0,0 +1,125 @@
+"use strict";
+var __defProp = Object.defineProperty;
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => { // Defines every entry of `all` on `target` as an enumerable getter property.
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+var __copyProps = (to, from, except, desc) => { // Copies the own properties of `from` onto `to` as getters that read through to `from`; returns `to`.
+  if (from && typeof from === "object" || typeof from === "function") {
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except) // keys already present on `to`, and the `except` key, are skipped
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // enumerable mirrors the source descriptor, or true when no descriptor is found
+  }
+  return to;
+};
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // Returns a fresh object flagged `__esModule: true` carrying getters for every own property of `mod`.
+var slugify_exports = {}; // export table for this module; populated with getters below
+__export(slugify_exports, {
+  slugify: () => slugify // getter thunk: `slugify` is resolved when the export is read, not when this table is built
+});
+module.exports = __toCommonJS(slugify_exports);
+var import_locale_currency = require("locale-currency");
+var import_to_words = require("to-words");
+var import_transliteration = require("@storyteller-platform/transliteration");
+const replacerMap = /* @__PURE__ */ new WeakMap(); // cache of createReplacers() results, keyed by the locale object passed to slugify()
+function createReplacers(locale) { // Builds two [RegExp, replacer] pairs for `locale`: one for currency amounts, one for plain numerals; both replacers spell the number out via toWords().
+  const maximizedLocale = locale.maximize();
+  const demoNumber = 123456.789; // sample value, formatted below only to discover which group/decimal/currency symbols the locale uses
+  const currencyFormat = new Intl.NumberFormat(locale, {
+    style: "currency",
+    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
+    currency: (0, import_locale_currency.getCurrency)(locale.maximize().region) // NOTE(review): maximize() is called a second time although maximizedLocale already exists; also confirm getCurrency() cannot return a value that Intl.NumberFormat rejects when the region is unknown or undefined
+  });
+  const currencyParts = currencyFormat.formatToParts(demoNumber);
+  const currencySymbols = currencyParts.reduce( // folds the formatted parts into { group, decimal, currency, currencyLeading }
+    (acc, part, index) => {
+      if (part.type === "group") {
+        return {
+          ...acc,
+          group: part.value
+        };
+      }
+      if (part.type === "decimal") {
+        return {
+          ...acc,
+          decimal: part.value
+        };
+      }
+      if (part.type === "currency") {
+        return {
+          ...acc,
+          currency: part.value,
+          currencyLeading: index === 0 // true only when the currency part is the very first formatted part
+        };
+      }
+      return acc;
+    },
+    { group: "", decimal: "", currency: "", currencyLeading: true }
+  );
+  const numeralRegexPart = `(\\p{Number}[\\p{Number}${currencySymbols.group}]*(?:[${currencySymbols.decimal}]\\p{Number}*)?)`; // one capture group: a numeral, then numerals/group separators, then an optional decimal part. NOTE(review): the symbols are interpolated into character classes unescaped — confirm no supported locale yields a class-special character here
+  const currencyRegex = currencySymbols.currencyLeading ? new RegExp(`[${currencySymbols.currency}]\\s?${numeralRegexPart}`, "gu") : new RegExp(`${numeralRegexPart}\\s?[${currencySymbols.currency}]`, "gu"); // currency symbol before or after the numeral, per currencyLeading. NOTE(review): the symbol sits inside [...], so a multi-character currency label would match any single one of its characters — verify
+  function currencyReplacer(match) { // reads match[1] (the numeral capture) and match[0] (whole match); presumably called by the transliteration library with the regex match array — confirm
+    const numeralMatch = match[1];
+    if (!numeralMatch) return match[0]; // no numeral captured: return the matched text unchanged
+    const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), "."); // drop every group separator, then turn the first decimal separator into "." for parseFloat. NOTE(review): if a symbol were still "" (its initial value) the pattern would be a lone backslash — confirm that cannot happen
+    const number = parseFloat(normalizedNumeral); // NOTE(review): \p{Number} also matches numerals that parseFloat cannot read (result NaN) — confirm how toWords() handles that
+    return (0, import_to_words.toWords)(number, {
+      localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
+      currency: true,
+      doNotAddOnly: true
+    });
+  }
+  const numberFormat = new Intl.NumberFormat(locale); // same symbol discovery as above, this time with the default (non-currency) number style
+  const numberParts = numberFormat.formatToParts(demoNumber);
+  const numberSymbols = numberParts.reduce(
+    (acc, part) => {
+      if (part.type === "group") {
+        return {
+          ...acc,
+          group: part.value
+        };
+      }
+      if (part.type === "decimal") {
+        return {
+          ...acc,
+          decimal: part.value
+        };
+      }
+      return acc;
+    },
+    { group: "", decimal: "" }
+  );
+  const numberRegex = new RegExp( // same numeral shape as numeralRegexPart, built from the plain-number symbols
+    `(\\p{Number}[\\p{Number}${numberSymbols.group}]*(?:[${numberSymbols.decimal}]\\p{Number}*)?)`,
+    "gu"
+  );
+  function numberReplacer(match) { // same steps as currencyReplacer, but calls toWords() without the currency options
+    const numeralMatch = match[1];
+    if (!numeralMatch) return match[0];
+    const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${numberSymbols.group}`, "g"), "").replace(new RegExp(`\\${numberSymbols.decimal}`), ".");
+    const number = parseFloat(normalizedNumeral);
+    return (0, import_to_words.toWords)(number, {
+      localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`
+    });
+  }
+  return [ // currency pair is listed before the plain-number pair; presumably the library applies them in this order — confirm
+    [currencyRegex, currencyReplacer],
+    [numberRegex, numberReplacer]
+  ];
+}
+async function slugify(text, locale) { // Runs the transliteration library's slugify on `text` with this locale's number/currency replacers; resolves to { result, mapping }.
+  const replacers = replacerMap.get(locale) ?? createReplacers(locale); // WeakMap lookup is by object identity, so two distinct Intl.Locale instances for the same tag do not share a cache entry
+  replacerMap.set(locale, replacers); // written on every call, including cache hits
+  const { result, mapping } = await (0, import_transliteration.slugify)(text, {
+    allowedChars: "a-zA-Z0-9",
+    replace: replacers
+  });
+  return { result, mapping };
+}
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = { // `0 &&` short-circuits, so this assignment never executes
+  slugify
+});

package/dist/align/slugify.d.cts ADDED Viewed

@@ -0,0 +1,8 @@
+import * as _storyteller_platform_transliteration from '@storyteller-platform/transliteration';
+declare function slugify(text: string, locale: Intl.Locale): Promise<{ // Resolves to the slugified string together with the transliteration library's Mapping for it.
+    result: string;
+    mapping: _storyteller_platform_transliteration.Mapping;
+}>;
+export { slugify };