npm - @storyteller-platform/align - Versions diffs - 0.1.9 → 0.1.11 - Mend

@storyteller-platform/align 0.1.9 → 0.1.11

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (113)

package/dist/align/__tests__/align.test.cjs +6 -5
package/dist/align/__tests__/align.test.js +6 -5
package/dist/align/align.cjs +133 -81
package/dist/align/align.d.cts +1 -0
package/dist/align/align.d.ts +1 -0
package/dist/align/align.js +133 -81
package/dist/align/getSentenceRanges.cjs +78 -149
package/dist/align/getSentenceRanges.d.cts +1 -1
package/dist/align/getSentenceRanges.d.ts +1 -1
package/dist/align/getSentenceRanges.js +78 -149
package/dist/align/slugify.cjs +16 -8
package/dist/align/slugify.js +16 -8
package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
package/dist/errorAlign/__tests__/native.test.cjs +118 -0
package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
package/dist/errorAlign/__tests__/native.test.js +107 -0
package/dist/errorAlign/backtraceGraph.cjs +298 -0
package/dist/errorAlign/backtraceGraph.d.cts +103 -0
package/dist/errorAlign/backtraceGraph.d.ts +103 -0
package/dist/errorAlign/backtraceGraph.js +270 -0
package/dist/errorAlign/beamSearch.cjs +302 -0
package/dist/errorAlign/beamSearch.d.cts +53 -0
package/dist/errorAlign/beamSearch.d.ts +53 -0
package/dist/errorAlign/beamSearch.js +268 -0
package/dist/errorAlign/core.cjs +33 -0
package/dist/errorAlign/core.d.cts +5 -0
package/dist/errorAlign/core.d.ts +5 -0
package/dist/errorAlign/core.js +11 -0
package/dist/errorAlign/editDistance.cjs +115 -0
package/dist/errorAlign/editDistance.d.cts +46 -0
package/dist/errorAlign/editDistance.d.ts +46 -0
package/dist/errorAlign/editDistance.js +90 -0
package/dist/errorAlign/errorAlign.cjs +159 -0
package/dist/errorAlign/errorAlign.d.cts +15 -0
package/dist/errorAlign/errorAlign.d.ts +15 -0
package/dist/errorAlign/errorAlign.js +145 -0
package/dist/errorAlign/graphMetadata.cjs +97 -0
package/dist/errorAlign/graphMetadata.d.cts +44 -0
package/dist/errorAlign/graphMetadata.d.ts +44 -0
package/dist/errorAlign/graphMetadata.js +64 -0
package/dist/errorAlign/hash.cjs +173 -0
package/dist/errorAlign/hash.d.cts +28 -0
package/dist/errorAlign/hash.d.ts +28 -0
package/dist/errorAlign/hash.js +150 -0
package/dist/errorAlign/native.cjs +60 -0
package/dist/errorAlign/native.d.cts +18 -0
package/dist/errorAlign/native.d.ts +18 -0
package/dist/errorAlign/native.js +24 -0
package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
package/dist/errorAlign/node-gyp-build.d.js +0 -0
package/dist/errorAlign/pathToAlignment.cjs +122 -0
package/dist/errorAlign/pathToAlignment.d.cts +11 -0
package/dist/errorAlign/pathToAlignment.d.ts +11 -0
package/dist/errorAlign/pathToAlignment.js +89 -0
package/dist/errorAlign/utils.cjs +301 -0
package/dist/errorAlign/utils.d.cts +107 -0
package/dist/errorAlign/utils.d.ts +107 -0
package/dist/errorAlign/utils.js +248 -0
package/dist/index.d.cts +1 -0
package/dist/index.d.ts +1 -0
package/dist/markup/__tests__/markup.test.cjs +108 -81
package/dist/markup/__tests__/markup.test.js +109 -82
package/dist/markup/__tests__/parseDom.test.cjs +112 -0
package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
package/dist/markup/__tests__/parseDom.test.js +89 -0
package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
package/dist/markup/__tests__/serializeDom.test.js +97 -0
package/dist/markup/__tests__/transform.test.cjs +122 -0
package/dist/markup/__tests__/transform.test.d.cts +2 -0
package/dist/markup/__tests__/transform.test.d.ts +2 -0
package/dist/markup/__tests__/transform.test.js +99 -0
package/dist/markup/map.cjs +261 -0
package/dist/markup/map.d.cts +50 -0
package/dist/markup/map.d.ts +50 -0
package/dist/markup/map.js +236 -0
package/dist/markup/markup.cjs +23 -201
package/dist/markup/markup.d.cts +5 -9
package/dist/markup/markup.d.ts +5 -9
package/dist/markup/markup.js +24 -203
package/dist/markup/model.cjs +172 -0
package/dist/markup/model.d.cts +57 -0
package/dist/markup/model.d.ts +57 -0
package/dist/markup/model.js +145 -0
package/dist/markup/parseDom.cjs +59 -0
package/dist/markup/parseDom.d.cts +7 -0
package/dist/markup/parseDom.d.ts +7 -0
package/dist/markup/parseDom.js +35 -0
package/dist/markup/segmentation.cjs +11 -57
package/dist/markup/segmentation.d.cts +6 -2
package/dist/markup/segmentation.d.ts +6 -2
package/dist/markup/segmentation.js +11 -58
package/dist/markup/serializeDom.cjs +87 -0
package/dist/markup/serializeDom.d.cts +7 -0
package/dist/markup/serializeDom.d.ts +7 -0
package/dist/markup/serializeDom.js +63 -0
package/dist/markup/transform.cjs +92 -0
package/dist/markup/transform.d.cts +11 -0
package/dist/markup/transform.d.ts +11 -0
package/dist/markup/transform.js +71 -0
package/dist/types/node-gyp-build.d.cjs +1 -0
package/dist/types/node-gyp-build.d.d.cts +3 -0
package/dist/types/node-gyp-build.d.d.ts +3 -0
package/dist/types/node-gyp-build.d.js +0 -0
package/package.json +11 -4

package/dist/align/align.js CHANGED Viewed

@@ -5,7 +5,9 @@ import {
 import { copyFile, mkdir, readFile, readdir, writeFile } from "node:fs/promises";
 import { dirname as autoDirname, join as autoJoin } from "node:path";
 import { basename, dirname, parse, relative } from "node:path/posix";
+import { enumerate } from "itertools";
 import memoize from "memoize";
+import { runes } from "runes2";
 import { isAudioFile, lookupAudioMime } from "@storyteller-platform/audiobook";
 import {
   Epub
@@ -24,7 +26,6 @@ import {
   interpolateSentenceRanges
 } from "./getSentenceRanges.js";
 import { slugify } from "./slugify.js";
-const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
 async function align(input, output, transcriptionsDir, audiobookDir, options) {
   var _stack = [];
   try {
@@ -75,6 +76,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
 class Aligner {
   constructor(epub, audiofiles, transcriptions, granularity, languageOverride, logger) {
     this.epub = epub;
+    this.audiofiles = audiofiles;
     this.languageOverride = languageOverride;
     this.logger = logger;
     this.transcription = concatTranscriptions(transcriptions, audiofiles);
@@ -89,71 +91,92 @@ class Aligner {
   report = {
     chapters: []
   };
-  findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
-    let i = 0;
-    while (i < transcriptionText.length) {
-      let startSentence = 0;
-      const proposedStartIndex = (lastMatchOffset + i) % transcriptionText.length;
-      const proposedEndIndex = (proposedStartIndex + OFFSET_SEARCH_WINDOW_SIZE) % transcriptionText.length;
-      const wrapping = proposedEndIndex < proposedStartIndex;
-      let endIndex = wrapping ? transcriptionText.length : proposedEndIndex;
-      let startIndex = proposedStartIndex;
-      let startSeen = null;
-      let endSeen = null;
-      for (const aligned of this.alignedChapters) {
-        const alignedStart = mapping.map(aligned.startOffset, -1);
-        const alignedEnd = mapping.map(aligned.endOffset, -1);
-        if (startSeen !== null && endSeen === alignedStart) {
-          endSeen = alignedEnd;
-        } else {
-          startSeen = alignedStart;
-          endSeen = alignedEnd;
-        }
-        if (startIndex >= startSeen && startIndex < endSeen) {
-          startIndex = endSeen;
-        }
-        if (endIndex >= startSeen && endIndex <= endSeen) {
-          endIndex = startSeen;
+  findBestOffset(epubSentences, transcriptionText, lastMatchOffset, dir = 1) {
+    const reverse = dir < 0;
+    if (dir < 0) {
+      epubSentences = epubSentences.toReversed().map((s) => runes(s).toReversed().join(""));
+      transcriptionText = runes(transcriptionText).toReversed().join("");
+      lastMatchOffset = transcriptionText.length - lastMatchOffset;
+    }
+    const flatSliceIndices = [
+      0,
+      ...this.alignedChapters.toSorted(
+        (a, b) => reverse ? transcriptionText.length - a.endOffset - (transcriptionText.length - b.endOffset) : a.startOffset - b.startOffset
+      ).flatMap((aligned) => [
+        reverse ? transcriptionText.length - aligned.endOffset : aligned.startOffset,
+        reverse ? transcriptionText.length - aligned.startOffset : aligned.endOffset
+      ]),
+      transcriptionText.length
+    ];
+    const sliceIndices = [];
+    for (let i = 0; i < flatSliceIndices.length - 1; i += 2) {
+      sliceIndices.push([flatSliceIndices[i], flatSliceIndices[i + 1]]);
+    }
+    const allSlices = [];
+    let startSlice = 0;
+    for (const [i, [start, end]] of enumerate(sliceIndices)) {
+      if (lastMatchOffset >= start && lastMatchOffset < end) {
+        if (!reverse) {
+          startSlice = i + 1;
+          allSlices.push({
+            start,
+            text: transcriptionText.slice(start, lastMatchOffset)
+          });
         }
+        allSlices.push({
+          start: lastMatchOffset,
+          text: transcriptionText.slice(lastMatchOffset, end)
+        });
+      } else if (!reverse) {
+        allSlices.push({ start, text: transcriptionText.slice(start, end) });
+      }
+    }
+    const slices = allSlices.filter((slice) => slice.text.length);
+    if (reverse && !slices.length) {
+      const indices = sliceIndices.find(([start]) => start > lastMatchOffset);
+      if (indices) {
+        slices.push({
+          start: indices[0],
+          text: transcriptionText.slice(...indices)
+        });
       }
-      if (startIndex < endIndex) {
-        const transcriptionTextSlice = transcriptionText.slice(
-          startIndex,
-          endIndex
+    }
+    for (const slice of slices.slice(startSlice).concat(slices.slice(0, startSlice))) {
+      let startSentence = 0;
+      while (startSentence < epubSentences.length) {
+        const needle = epubSentences.slice(startSentence, startSentence + 6).join("-");
+        const firstMatch = findNearestMatch(
+          needle,
+          slice.text,
+          Math.max(Math.floor(0.1 * needle.length), 1)
         );
-        while (startSentence < epubSentences.length) {
-          const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
-          const firstMatch = findNearestMatch(
-            queryString.toLowerCase(),
-            transcriptionTextSlice.toLowerCase(),
-            Math.max(Math.floor(0.1 * queryString.length), 1)
-          );
-          if (firstMatch) {
-            return {
-              startSentence,
-              transcriptionOffset: (firstMatch.index + startIndex) % transcriptionText.length
-            };
-          }
-          startSentence += 3;
+        if (firstMatch) {
+          const start = reverse ? transcriptionText.length - (slice.start + firstMatch.index) : slice.start + firstMatch.index;
+          return {
+            startSentence: reverse ? epubSentences.length - startSentence : startSentence,
+            transcriptionOffset: start
+          };
         }
+        startSentence += 3;
       }
-      if (wrapping) {
-        i += transcriptionText.length - proposedStartIndex;
-      } else {
-        i += Math.floor(OFFSET_SEARCH_WINDOW_SIZE / 2);
-      }
+    }
+    if (reverse) {
+      return {
+        startSentence: epubSentences.length,
+        transcriptionOffset: slices[0] ? transcriptionText.length - slices[0].start : null
+      };
     }
     return { startSentence: 0, transcriptionOffset: null };
   }
   async getChapterSentences(chapterId) {
     const chapterXml = await this.epub.readXhtmlItemContents(chapterId);
-    const segmentation = await getXhtmlSegmentation(
+    const { result: segmentation } = await getXhtmlSegmentation(
       Epub.getXhtmlBody(chapterXml),
       {
         primaryLocale: this.languageOverride ?? await this.epub.getLanguage()
       }
     );
-    return segmentation.sentences.map((s) => s.text);
+    return segmentation.map((s) => s.text).filter((s) => s.match(/\S/));
   }
   async writeAlignedChapter(alignedChapter) {
     const { chapter, sentenceRanges, xml } = alignedChapter;
@@ -248,7 +271,7 @@ class Aligner {
       }, [])
     });
   }
-  async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
+  async alignChapter(startSentence, endSentence, chapterId, transcriptionOffset, transcriptionEndOffset, locale, mapping) {
     const timing = createTiming();
     timing.start("read contents");
     const manifest = await this.epub.getManifest();
@@ -265,20 +288,14 @@ class Aligner {
     timing.start("align sentences");
     const { sentenceRanges, transcriptionOffset: endTranscriptionOffset } = await getSentenceRanges(
       startSentence,
+      endSentence,
       this.transcription,
       chapterSentences,
       transcriptionOffset,
-      locale,
-      lastSentenceRange
+      transcriptionEndOffset,
+      locale
     );
     timing.end("align sentences");
-    timing.start("expand ranges");
-    const interpolated = await interpolateSentenceRanges(
-      sentenceRanges,
-      lastSentenceRange
-    );
-    const expanded = expandEmptySentenceRanges(interpolated);
-    timing.end("expand ranges");
     const storytellerStylesheetUrl = relative(
       dirname(chapter.href),
       "Styles/storyteller-readaloud.css"
@@ -291,25 +308,25 @@ class Aligner {
     this.alignedChapters.push({
       chapter,
       xml: chapterXml,
-      sentenceRanges: expanded,
-      startOffset: transcriptionOffset,
-      endOffset: endTranscriptionOffset
+      sentenceRanges,
+      startOffset: mapping.map(transcriptionOffset),
+      endOffset: mapping.map(endTranscriptionOffset, -1)
     });
     this.addChapterReport(
       chapter,
       chapterSentences,
-      expanded,
+      sentenceRanges,
       startSentence,
       transcriptionOffset
     );
     return {
-      lastSentenceRange: expanded[expanded.length - 1] ?? null,
+      lastSentenceRange: sentenceRanges.at(-1) ?? null,
       endTranscriptionOffset,
       timing
     };
   }
   async alignBook(onProgress) {
-    var _a, _b, _c, _d, _e, _f, _g;
+    var _a, _b, _c, _d, _e, _f, _g, _h;
     const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
     this.timing.setMetadata("language", locale.toString());
     this.timing.setMetadata("granularity", this.granularity);
@@ -320,7 +337,6 @@ class Aligner {
       locale
     );
     let lastTranscriptionOffset = 0;
-    let lastSentenceRange = null;
     for (let index = 0; index < spine.length; index++) {
       onProgress == null ? void 0 : onProgress(index / spine.length);
       const spineItem = spine[index];
@@ -352,36 +368,72 @@ class Aligner {
       const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
         slugifiedChapterSentences,
         transcriptionText,
-        mapping.map(lastTranscriptionOffset, -1),
-        mapping
+        mapping.map(lastTranscriptionOffset, -1)
       );
-      const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
-      if (transcriptionOffset === null) {
+      if (slugifiedOffset === null) {
         (_f = this.logger) == null ? void 0 : _f.info(
           `Couldn't find matching transcription for chapter #${index}`
         );
         continue;
       }
-      (_g = this.logger) == null ? void 0 : _g.info(
-        `Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
+      const transcriptionOffset = mapping.invert().map(slugifiedOffset, -1);
+      const {
+        startSentence: startEndSentence,
+        transcriptionOffset: slugifiedEndOffset
+      } = this.findBestOffset(
+        slugifiedChapterSentences,
+        transcriptionText,
+        Math.min(
+          transcriptionText.length,
+          slugifiedOffset + Math.round(slugifiedChapterSentences.join("-").length * 1.2)
+        ),
+        -1
+      );
+      const endSentence = startEndSentence;
+      const endOffset = slugifiedEndOffset === null ? this.transcription.transcript.length : mapping.invert().map(slugifiedEndOffset, 1);
+      if (endSentence - startSentence < slugifiedChapterSentences.length / 2) {
+        (_g = this.logger) == null ? void 0 : _g.info(`Found less than half of chapter #${index}, skipping`);
+      }
+      (_h = this.logger) == null ? void 0 : _h.info(
+        `Chapter #${index} best matches transcription from ${transcriptionOffset} to ${endOffset}, from sentence ${startSentence} to ${endSentence} (of ${slugifiedChapterSentences.length}) in the book`
       );
       const result = await this.alignChapter(
         startSentence,
+        endSentence,
         chapterId,
         transcriptionOffset,
+        endOffset,
         locale,
-        lastSentenceRange
+        mapping
       );
-      lastSentenceRange = result.lastSentenceRange;
       lastTranscriptionOffset = result.endTranscriptionOffset;
       this.timing.add(result.timing.summary());
     }
-    if (lastSentenceRange) {
-      lastSentenceRange.end = await getTrackDuration(
-        lastSentenceRange.audiofile
+    const audioOrderedChapters = this.alignedChapters.toSorted((a, b) => {
+      const firstRangeA = a.sentenceRanges[0];
+      const firstRangeB = b.sentenceRanges[0];
+      if (!firstRangeA) return 1;
+      if (!firstRangeB) return -1;
+      const firstAudiofileIndexA = this.audiofiles.indexOf(
+        firstRangeA.audiofile
       );
-    }
-    for (const alignedChapter of this.alignedChapters) {
+      const firstAudiofileIndexB = this.audiofiles.indexOf(
+        firstRangeB.audiofile
+      );
+      if (firstAudiofileIndexA === firstAudiofileIndexB) {
+        return firstRangeA.start - firstRangeB.start;
+      }
+      return firstAudiofileIndexA - firstAudiofileIndexB;
+    });
+    let lastSentenceRange = null;
+    for (const alignedChapter of audioOrderedChapters) {
+      const interpolated = await interpolateSentenceRanges(
+        alignedChapter.sentenceRanges,
+        lastSentenceRange
+      );
+      const expanded = expandEmptySentenceRanges(interpolated);
+      alignedChapter.sentenceRanges = expanded;
+      lastSentenceRange = expanded.at(-1) ?? null;
       await this.writeAlignedChapter(alignedChapter);
     }
     await this.epub.addMetadata({

package/dist/align/getSentenceRanges.cjs CHANGED Viewed

@@ -25,29 +25,10 @@ __export(getSentenceRanges_exports, {
   interpolateSentenceRanges: () => interpolateSentenceRanges
 });
 module.exports = __toCommonJS(getSentenceRanges_exports);
-var import_text_segmentation = require("@echogarden/text-segmentation");
+var import_itertools = require("itertools");
 var import_ffmpeg = require("../common/ffmpeg.cjs");
-var import_fuzzy = require("./fuzzy.cjs");
+var import_errorAlign = require("../errorAlign/errorAlign.cjs");
 var import_slugify = require("./slugify.cjs");
-async function getSentencesWithOffsets(text) {
-  const sentences = await (0, import_text_segmentation.segmentText)(text).then(
-    (r) => r.sentences.map((s) => s.text)
-  );
-  const sentencesWithOffsets = [];
-  let lastSentenceEnd = 0;
-  for (const sentence of sentences) {
-    const sentenceStart = text.indexOf(sentence, lastSentenceEnd);
-    if (sentenceStart > lastSentenceEnd) {
-      sentencesWithOffsets.push(text.slice(lastSentenceEnd, sentenceStart));
-    }
-    sentencesWithOffsets.push(sentence);
-    lastSentenceEnd = sentenceStart + sentence.length;
-  }
-  if (text.length > lastSentenceEnd) {
-    sentencesWithOffsets.push(text.slice(lastSentenceEnd));
-  }
-  return sentencesWithOffsets;
-}
 function findStartTimestamp(matchStartIndex, transcription) {
   const entry = transcription.timeline.find(
     (entry2) => (entry2.endOffsetUtf16 ?? 0) > matchStartIndex
@@ -65,144 +46,92 @@ function findEndTimestamp(matchEndIndex, transcription) {
   );
   return (entry == null ? void 0 : entry.endTime) ?? null;
 }
-function getWindowIndexFromOffset(window, offset) {
-  let index = 0;
-  while (index < window.length - 1 && offset >= window[index].length) {
-    offset -= window[index].length;
-    index += 1;
+function getAlignmentsForSentence(sentence, alignments) {
+  const result = [];
+  let sentenceIndex = 0;
+  for (const alignment of alignments) {
+    if (sentenceIndex === sentence.length) break;
+    if (alignment.opType !== "INSERT") {
+      sentenceIndex += alignment.ref.length + (sentenceIndex === 0 ? 0 : 1);
+    }
+    result.push(alignment);
   }
-  return { index, offset };
+  return result;
 }
-function collapseWhitespace(input) {
-  return input.replaceAll(/\s+/g, " ");
-}
-async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
+async function getSentenceRanges(startSentence, endSentence, transcription, sentences, chapterOffset, chapterEndOffset, locale) {
   const sentenceRanges = [];
-  const fullTranscriptionText = transcription.transcript;
-  const transcriptionText = fullTranscriptionText.slice(chapterOffset);
-  const transcriptionSentences = await getSentencesWithOffsets(
-    transcriptionText
-  ).then((s) => s.map((sentence) => sentence.toLowerCase()));
-  let startSentenceEntry = startSentence;
-  const sentenceEntries = [];
-  for (let i = 0; i < sentences.length; i++) {
-    const sentence = (await (0, import_slugify.slugify)(sentences[i], locale)).result;
-    if (sentence.length <= 3) {
-      if (i < startSentence) startSentenceEntry--;
-      continue;
-    }
-    sentenceEntries.push([i, sentence]);
-  }
-  let transcriptionWindowIndex = 0;
-  let transcriptionWindowOffset = 0;
-  let lastGoodTranscriptionWindow = 0;
-  let notFound = 0;
-  let sentenceIndex = startSentenceEntry;
-  let lastMatchEnd = chapterOffset;
-  while (sentenceIndex < sentenceEntries.length) {
-    const [sentenceId, sentence] = sentenceEntries[sentenceIndex];
-    const transcriptionWindowList = transcriptionSentences.slice(
-      transcriptionWindowIndex,
-      transcriptionWindowIndex + 10
-    );
-    const { result: transcriptionWindow, mapping } = await (0, import_slugify.slugify)(
-      transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
-      locale
-    );
-    const inverted = mapping.invert();
-    const query = collapseWhitespace(sentence.trim()).toLowerCase();
-    const firstMatch = (0, import_fuzzy.findNearestMatch)(
-      query,
-      transcriptionWindow,
-      Math.max(Math.floor(0.25 * query.length), 1)
-    );
-    if (!firstMatch) {
-      sentenceIndex += 1;
-      notFound += 1;
-      if (notFound === 3 || sentenceIndex === sentenceEntries.length) {
-        transcriptionWindowIndex += 1;
-        if (transcriptionWindowIndex == lastGoodTranscriptionWindow + 30) {
-          transcriptionWindowIndex = lastGoodTranscriptionWindow;
-          notFound = 0;
-          continue;
-        }
-        sentenceIndex -= notFound;
-        notFound = 0;
-      }
-      continue;
+  const fullTranscript = transcription.transcript;
+  const chapterTranscript = fullTranscript.slice(
+    chapterOffset,
+    chapterEndOffset
+  );
+  const { result: slugifiedChapterTranscript, mapping: transcriptMapping } = await (0, import_slugify.slugify)(chapterTranscript, locale);
+  let chapterTranscriptEndIndex = chapterOffset;
+  let chapterSentenceIndex = startSentence;
+  let slugifiedChapterTranscriptWindowStartIndex = 0;
+  while (chapterSentenceIndex < endSentence) {
+    const slugifiedChapterSentenceWindowList = [];
+    let sentenceWindowLength = 0;
+    let i = chapterSentenceIndex;
+    while (sentenceWindowLength < 5e3 && i < sentences.length) {
+      const { result: sentence } = await (0, import_slugify.slugify)(sentences[i], locale);
+      slugifiedChapterSentenceWindowList.push(sentence);
+      sentenceWindowLength += sentence.length;
+      i++;
     }
-    const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
-    const matchStart = inverted.map(firstMatch.index, 1);
-    const matchEnd = inverted.map(
-      firstMatch.index + firstMatch.match.length,
-      -1
+    const slugifiedChapterSentenceWindow = slugifiedChapterSentenceWindowList.join("-");
+    const slugifiedChapterTranscriptWindow = slugifiedChapterTranscript.slice(
+      slugifiedChapterTranscriptWindowStartIndex,
+      slugifiedChapterTranscriptWindowStartIndex + sentenceWindowLength * 1.2
     );
-    const startResult = findStartTimestamp(
-      matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
-      transcription
+    const alignments = (0, import_errorAlign.errorAlign)(
+      slugifiedChapterSentenceWindow,
+      slugifiedChapterTranscriptWindow
     );
-    if (!startResult) {
-      sentenceIndex += 1;
-      continue;
-    }
-    let start = startResult.start;
-    const audiofile = startResult.audiofile;
-    const end = findEndTimestamp(
-      matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
-      transcription
-    ) ?? startResult.end;
-    if (sentenceRanges.length > 0) {
-      const previousSentenceRange = sentenceRanges[sentenceRanges.length - 1];
-      const previousAudiofile = previousSentenceRange.audiofile;
-      if (audiofile === previousAudiofile) {
-        if (previousSentenceRange.id === sentenceId - 1) {
-          previousSentenceRange.end = start;
-        }
-      } else {
-        if (previousSentenceRange.id === sentenceId - 1) {
-          const lastTrackDuration = await (0, import_ffmpeg.getTrackDuration)(previousAudiofile);
-          previousSentenceRange.end = lastTrackDuration;
-          start = 0;
-        }
+    let alignmentIndex = 0;
+    let currentTranscriptWindowIndex = 0;
+    for (const [i2, slugifiedSentence] of (0, import_itertools.enumerate)(
+      slugifiedChapterSentenceWindowList
+    )) {
+      if (!slugifiedSentence) continue;
+      const sentenceAlignments = getAlignmentsForSentence(
+        slugifiedSentence,
+        alignments.slice(alignmentIndex)
+      );
+      const sentenceLengthInSlugifiedTranscript = sentenceAlignments.filter((a) => a.opType !== "DELETE").map((a) => a.hyp).join("-").length;
+      const start = findStartTimestamp(
+        chapterOffset + transcriptMapping.invert().map(
+          slugifiedChapterTranscriptWindowStartIndex + currentTranscriptWindowIndex,
+          1
+        ),
+        transcription
+      );
+      chapterTranscriptEndIndex = chapterOffset + transcriptMapping.invert().map(
+        slugifiedChapterTranscriptWindowStartIndex + currentTranscriptWindowIndex + sentenceLengthInSlugifiedTranscript,
+        -1
+      );
+      const end = findEndTimestamp(chapterTranscriptEndIndex, transcription);
+      if (start && end !== null) {
+        sentenceRanges.push({
+          id: i2 + chapterSentenceIndex,
+          start: start.start,
+          audiofile: start.audiofile,
+          end
+        });
       }
-    } else if (lastSentenceRange !== null) {
-      if (audiofile === lastSentenceRange.audiofile) {
-        if (sentenceId === 0) {
-          lastSentenceRange.end = start;
-        }
-      } else {
-        const lastTrackDuration = await (0, import_ffmpeg.getTrackDuration)(
-          lastSentenceRange.audiofile
-        );
-        lastSentenceRange.end = lastTrackDuration;
-        if (sentenceId === 0) {
-          start = 0;
-        }
+      alignmentIndex += sentenceAlignments.length;
+      currentTranscriptWindowIndex += sentenceLengthInSlugifiedTranscript;
+      if (slugifiedChapterTranscriptWindow[currentTranscriptWindowIndex] === "-") {
+        currentTranscriptWindowIndex++;
       }
-    } else if (sentenceId === 0) {
-      start = 0;
     }
-    sentenceRanges.push({
-      id: sentenceId,
-      start,
-      end,
-      audiofile
-    });
-    notFound = 0;
-    lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
-    const windowIndexResult = getWindowIndexFromOffset(
-      transcriptionWindowList,
-      matchEnd + transcriptionWindowOffset
-    );
-    transcriptionWindowIndex += windowIndexResult.index;
-    transcriptionWindowOffset = windowIndexResult.offset;
-    lastGoodTranscriptionWindow = transcriptionWindowIndex;
-    sentenceIndex += 1;
+    chapterSentenceIndex += slugifiedChapterSentenceWindowList.length;
+    slugifiedChapterTranscriptWindowStartIndex += currentTranscriptWindowIndex;
+    if (slugifiedChapterTranscript[slugifiedChapterTranscriptWindowStartIndex] === "-") {
+      slugifiedChapterTranscriptWindowStartIndex++;
+    }
   }
-  return {
-    sentenceRanges,
-    transcriptionOffset: lastMatchEnd
-  };
+  return { sentenceRanges, transcriptionOffset: chapterTranscriptEndIndex };
 }
 async function getLargestGap(trailing, leading) {
   const leadingGap = leading.start;

package/dist/align/getSentenceRanges.d.cts CHANGED Viewed

@@ -14,7 +14,7 @@ type SentenceRange = {
     audiofile: string;
 };
 declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
-declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
+declare function getSentenceRanges(startSentence: number, endSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, chapterEndOffset: number, locale: Intl.Locale): Promise<{
     sentenceRanges: SentenceRange[];
     transcriptionOffset: number;
 }>;

package/dist/align/getSentenceRanges.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ type SentenceRange = {
     audiofile: string;
 };
 declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
-declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
+declare function getSentenceRanges(startSentence: number, endSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, chapterEndOffset: number, locale: Intl.Locale): Promise<{
     sentenceRanges: SentenceRange[];
     transcriptionOffset: number;
 }>;