npm - @storyteller-platform/ghost-story - Versions diffs - 0.1.6 → 0.1.7 - Mend

@storyteller-platform/ghost-story 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/cli/config.cjs +7 -1
package/dist/cli/config.d.cts +4 -2
package/dist/cli/config.d.ts +4 -2
package/dist/cli/config.js +5 -1
package/dist/utilities/SpacelessScripts.cjs +48 -0
package/dist/utilities/SpacelessScripts.d.cts +10 -0
package/dist/utilities/SpacelessScripts.d.ts +10 -0
package/dist/utilities/SpacelessScripts.js +22 -0
package/dist/utilities/Timeline.cjs +12 -5
package/dist/utilities/Timeline.js +12 -5
package/dist/utilities/WhisperTimeline.cjs +118 -22
package/dist/utilities/WhisperTimeline.js +118 -22
package/package.json +1 -1

package/dist/cli/config.cjs CHANGED Viewed

@@ -35,6 +35,8 @@ __export(config_exports, {
   MODEL_SIZES: () => MODEL_SIZES,
   RECOGNITION_ENGINES: () => RECOGNITION_ENGINES,
   SILERO_VAD_VERSION: () => SILERO_VAD_VERSION,
+  WHISPER_CPP_PATCH_LEVEL: () => WHISPER_CPP_PATCH_LEVEL,
+  WHISPER_CPP_UPSTREAM_VERSION: () => WHISPER_CPP_UPSTREAM_VERSION,
   WHISPER_CPP_VERSION: () => WHISPER_CPP_VERSION,
   WHISPER_MODELS: () => WHISPER_MODELS,
   WHISPER_MODEL_VERSION: () => WHISPER_MODEL_VERSION,
@@ -72,7 +74,9 @@ var import_node_os = __toESM(require("node:os"), 1);
 var import_node_path = __toESM(require("node:path"), 1);
 var import_zod = __toESM(require("zod"), 1);
 var import_FileSystem = require("../utilities/FileSystem.cjs");
-const WHISPER_CPP_VERSION = "1.8.3";
+const WHISPER_CPP_UPSTREAM_VERSION = "1.8.3";
+const WHISPER_CPP_PATCH_LEVEL = 2;
+const WHISPER_CPP_VERSION = `${WHISPER_CPP_UPSTREAM_VERSION}-st.${WHISPER_CPP_PATCH_LEVEL}`;
 const WHISPER_MODEL_VERSION = "1.0.0";
 const SILERO_VAD_VERSION = "6.2.0";
 const GITLAB_PROJECT_PATH = "storyteller-platform/storyteller";
@@ -478,6 +482,8 @@ function writeConfig(config) {
   MODEL_SIZES,
   RECOGNITION_ENGINES,
   SILERO_VAD_VERSION,
+  WHISPER_CPP_PATCH_LEVEL,
+  WHISPER_CPP_UPSTREAM_VERSION,
   WHISPER_CPP_VERSION,
   WHISPER_MODELS,
   WHISPER_MODEL_VERSION,

package/dist/cli/config.d.cts CHANGED Viewed

@@ -1,6 +1,8 @@
 import z from 'zod';
-declare const WHISPER_CPP_VERSION = "1.8.3";
+declare const WHISPER_CPP_UPSTREAM_VERSION = "1.8.3";
+declare const WHISPER_CPP_PATCH_LEVEL = 2;
+declare const WHISPER_CPP_VERSION = "1.8.3-st.2";
 declare const WHISPER_MODEL_VERSION = "1.0.0";
 declare const SILERO_VAD_VERSION = "6.2.0";
 declare const GITLAB_PROJECT_PATH = "storyteller-platform/storyteller";
@@ -73,4 +75,4 @@ type CLIConfig = z.infer<typeof cliConfigSchema>;
 declare function readConfig(): CLIConfig;
 declare function writeConfig(config: Partial<CLIConfig>): void;
-export { BUILD_VARIANTS, type BuildVariant, type CLIConfig, GITLAB_PROJECT_ID, GITLAB_PROJECT_PATH, GITLAB_WHIPSER_ML_ID, MODEL_SIZES, RECOGNITION_ENGINES, SILERO_VAD_VERSION, WHISPER_CPP_VERSION, WHISPER_MODELS, WHISPER_MODEL_VERSION, type WhisperModel, applyLegacyCpuFallback, cliConfigSchema, detectPlatform, getBinaryDownloadUrl, getCompatibleVariants, getConfiguredVariant, getCoremlModelDownloadUrl, getCoremlModelPath, getInstallDir, getInstalledVariant, getModelDir, getModelDownloadUrl, getModelPath, getVadExecutablePath, getVadModelDownloadUrl, getVadModelPath, getWhisperBaseDir, getWhisperExecutablePath, getWhisperServerExecutablePath, isValidModel, isValidVariant, isVariantCompatibleWithCurrentPlatform, needsCoremlModel, readConfig, resolveVariant, writeConfig };
+export { BUILD_VARIANTS, type BuildVariant, type CLIConfig, GITLAB_PROJECT_ID, GITLAB_PROJECT_PATH, GITLAB_WHIPSER_ML_ID, MODEL_SIZES, RECOGNITION_ENGINES, SILERO_VAD_VERSION, WHISPER_CPP_PATCH_LEVEL, WHISPER_CPP_UPSTREAM_VERSION, WHISPER_CPP_VERSION, WHISPER_MODELS, WHISPER_MODEL_VERSION, type WhisperModel, applyLegacyCpuFallback, cliConfigSchema, detectPlatform, getBinaryDownloadUrl, getCompatibleVariants, getConfiguredVariant, getCoremlModelDownloadUrl, getCoremlModelPath, getInstallDir, getInstalledVariant, getModelDir, getModelDownloadUrl, getModelPath, getVadExecutablePath, getVadModelDownloadUrl, getVadModelPath, getWhisperBaseDir, getWhisperExecutablePath, getWhisperServerExecutablePath, isValidModel, isValidVariant, isVariantCompatibleWithCurrentPlatform, needsCoremlModel, readConfig, resolveVariant, writeConfig };

package/dist/cli/config.d.ts CHANGED Viewed

@@ -1,6 +1,8 @@
 import z from 'zod';
-declare const WHISPER_CPP_VERSION = "1.8.3";
+declare const WHISPER_CPP_UPSTREAM_VERSION = "1.8.3";
+declare const WHISPER_CPP_PATCH_LEVEL = 2;
+declare const WHISPER_CPP_VERSION = "1.8.3-st.2";
 declare const WHISPER_MODEL_VERSION = "1.0.0";
 declare const SILERO_VAD_VERSION = "6.2.0";
 declare const GITLAB_PROJECT_PATH = "storyteller-platform/storyteller";
@@ -73,4 +75,4 @@ type CLIConfig = z.infer<typeof cliConfigSchema>;
 declare function readConfig(): CLIConfig;
 declare function writeConfig(config: Partial<CLIConfig>): void;
-export { BUILD_VARIANTS, type BuildVariant, type CLIConfig, GITLAB_PROJECT_ID, GITLAB_PROJECT_PATH, GITLAB_WHIPSER_ML_ID, MODEL_SIZES, RECOGNITION_ENGINES, SILERO_VAD_VERSION, WHISPER_CPP_VERSION, WHISPER_MODELS, WHISPER_MODEL_VERSION, type WhisperModel, applyLegacyCpuFallback, cliConfigSchema, detectPlatform, getBinaryDownloadUrl, getCompatibleVariants, getConfiguredVariant, getCoremlModelDownloadUrl, getCoremlModelPath, getInstallDir, getInstalledVariant, getModelDir, getModelDownloadUrl, getModelPath, getVadExecutablePath, getVadModelDownloadUrl, getVadModelPath, getWhisperBaseDir, getWhisperExecutablePath, getWhisperServerExecutablePath, isValidModel, isValidVariant, isVariantCompatibleWithCurrentPlatform, needsCoremlModel, readConfig, resolveVariant, writeConfig };
+export { BUILD_VARIANTS, type BuildVariant, type CLIConfig, GITLAB_PROJECT_ID, GITLAB_PROJECT_PATH, GITLAB_WHIPSER_ML_ID, MODEL_SIZES, RECOGNITION_ENGINES, SILERO_VAD_VERSION, WHISPER_CPP_PATCH_LEVEL, WHISPER_CPP_UPSTREAM_VERSION, WHISPER_CPP_VERSION, WHISPER_MODELS, WHISPER_MODEL_VERSION, type WhisperModel, applyLegacyCpuFallback, cliConfigSchema, detectPlatform, getBinaryDownloadUrl, getCompatibleVariants, getConfiguredVariant, getCoremlModelDownloadUrl, getCoremlModelPath, getInstallDir, getInstalledVariant, getModelDir, getModelDownloadUrl, getModelPath, getVadExecutablePath, getVadModelDownloadUrl, getVadModelPath, getWhisperBaseDir, getWhisperExecutablePath, getWhisperServerExecutablePath, isValidModel, isValidVariant, isVariantCompatibleWithCurrentPlatform, needsCoremlModel, readConfig, resolveVariant, writeConfig };

package/dist/cli/config.js CHANGED Viewed

@@ -4,7 +4,9 @@ import os from "node:os";
 import path from "node:path";
 import z from "zod";
 import { getAppDataDir } from "../utilities/FileSystem.js";
-const WHISPER_CPP_VERSION = "1.8.3";
+const WHISPER_CPP_UPSTREAM_VERSION = "1.8.3";
+const WHISPER_CPP_PATCH_LEVEL = 2;
+const WHISPER_CPP_VERSION = `${WHISPER_CPP_UPSTREAM_VERSION}-st.${WHISPER_CPP_PATCH_LEVEL}`;
 const WHISPER_MODEL_VERSION = "1.0.0";
 const SILERO_VAD_VERSION = "6.2.0";
 const GITLAB_PROJECT_PATH = "storyteller-platform/storyteller";
@@ -409,6 +411,8 @@ export {
   MODEL_SIZES,
   RECOGNITION_ENGINES,
   SILERO_VAD_VERSION,
+  WHISPER_CPP_PATCH_LEVEL,
+  WHISPER_CPP_UPSTREAM_VERSION,
   WHISPER_CPP_VERSION,
   WHISPER_MODELS,
   WHISPER_MODEL_VERSION,

package/dist/utilities/SpacelessScripts.cjs ADDED Viewed

@@ -0,0 +1,48 @@
+"use strict";
+var __defProp = Object.defineProperty;
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => {
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+var __copyProps = (to, from, except, desc) => {
+  if (from && typeof from === "object" || typeof from === "function") {
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except)
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+  }
+  return to;
+};
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+var SpacelessScripts_exports = {};
+__export(SpacelessScripts_exports, {
+  spacelessScriptPattern: () => spacelessScriptPattern,
+  spacelessScripts: () => spacelessScripts,
+  startsWithSpacelessScript: () => startsWithSpacelessScript
+});
+module.exports = __toCommonJS(SpacelessScripts_exports);
+const spacelessScripts = [
+  { name: "thai", from: 3585, to: 3663 },
+  { name: "lao", from: 3713, to: 3807 },
+  { name: "tibetan", from: 3840, to: 4095 },
+  { name: "myanmar", from: 4096, to: 4255 },
+  { name: "khmer", from: 6016, to: 6143 },
+  { name: "hiragana", from: 12352, to: 12447 },
+  { name: "katakana", from: 12448, to: 12543 },
+  { name: "cjk-ext-a", from: 13312, to: 19903 },
+  { name: "cjk-unified", from: 19968, to: 40959 },
+  { name: "cjk-compat", from: 63744, to: 64255 }
+];
+const charClass = spacelessScripts.map((s) => `${String.fromCharCode(s.from)}-${String.fromCharCode(s.to)}`).join("");
+const spacelessScriptPattern = new RegExp(`[${charClass}]`);
+function startsWithSpacelessScript(text) {
+  return spacelessScriptPattern.test(text.charAt(0));
+}
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = {
+  spacelessScriptPattern,
+  spacelessScripts,
+  startsWithSpacelessScript
+});

package/dist/utilities/SpacelessScripts.d.cts ADDED Viewed

@@ -0,0 +1,10 @@
+interface ScriptRange {
+    name: string;
+    from: number;
+    to: number;
+}
+declare const spacelessScripts: readonly ScriptRange[];
+declare const spacelessScriptPattern: RegExp;
+declare function startsWithSpacelessScript(text: string): boolean;
+export { spacelessScriptPattern, spacelessScripts, startsWithSpacelessScript };

package/dist/utilities/SpacelessScripts.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+interface ScriptRange {
+    name: string;
+    from: number;
+    to: number;
+}
+declare const spacelessScripts: readonly ScriptRange[];
+declare const spacelessScriptPattern: RegExp;
+declare function startsWithSpacelessScript(text: string): boolean;
+export { spacelessScriptPattern, spacelessScripts, startsWithSpacelessScript };

package/dist/utilities/SpacelessScripts.js ADDED Viewed

@@ -0,0 +1,22 @@
+const spacelessScripts = [
+  { name: "thai", from: 3585, to: 3663 },
+  { name: "lao", from: 3713, to: 3807 },
+  { name: "tibetan", from: 3840, to: 4095 },
+  { name: "myanmar", from: 4096, to: 4255 },
+  { name: "khmer", from: 6016, to: 6143 },
+  { name: "hiragana", from: 12352, to: 12447 },
+  { name: "katakana", from: 12448, to: 12543 },
+  { name: "cjk-ext-a", from: 13312, to: 19903 },
+  { name: "cjk-unified", from: 19968, to: 40959 },
+  { name: "cjk-compat", from: 63744, to: 64255 }
+];
+const charClass = spacelessScripts.map((s) => `${String.fromCharCode(s.from)}-${String.fromCharCode(s.to)}`).join("");
+const spacelessScriptPattern = new RegExp(`[${charClass}]`);
+function startsWithSpacelessScript(text) {
+  return spacelessScriptPattern.test(text.charAt(0));
+}
+export {
+  spacelessScriptPattern,
+  spacelessScripts,
+  startsWithSpacelessScript
+};

package/dist/utilities/Timeline.cjs CHANGED Viewed

@@ -24,6 +24,7 @@ __export(Timeline_exports, {
   getUTF32Chars: () => getUTF32Chars
 });
 module.exports = __toCommonJS(Timeline_exports);
+var import_SpacelessScripts = require("./SpacelessScripts.cjs");
 function addWordTextOffsetsToTimelineInPlace(timeline, text) {
   const { utf16To32Mapping } = getUTF32Chars(text);
   let currentOffset = 0;
@@ -96,16 +97,22 @@ function getUTF32Chars(str) {
 function buildTranscriptFromTimeline(timeline) {
   const words = [];
   function collectWords(entries) {
-    for (const entry of entries) {
-      if (entry.type === "word") {
-        words.push(entry.text);
-      } else if (entry.timeline) {
+    for (let i = 0; i < entries.length; i++) {
+      const entry = entries[i];
+      if (entry.timeline) {
         collectWords(entry.timeline);
+        continue;
+      }
+      if (!(0, import_SpacelessScripts.startsWithSpacelessScript)(entry.text) && i !== entries.length - 1) {
+        words.push(entry.text);
+        words.push(" ");
+        continue;
       }
+      words.push(entry.text);
     }
   }
   collectWords(timeline);
-  return words.join(" ");
+  return words.join("");
 }
 function addTimeOffsetToTimeline(targetTimeline, timeOffset) {
   const newTimeline = structuredClone(targetTimeline);

package/dist/utilities/Timeline.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { startsWithSpacelessScript } from "./SpacelessScripts.js";
 function addWordTextOffsetsToTimelineInPlace(timeline, text) {
   const { utf16To32Mapping } = getUTF32Chars(text);
   let currentOffset = 0;
@@ -70,16 +71,22 @@ function getUTF32Chars(str) {
 function buildTranscriptFromTimeline(timeline) {
   const words = [];
   function collectWords(entries) {
-    for (const entry of entries) {
-      if (entry.type === "word") {
-        words.push(entry.text);
-      } else if (entry.timeline) {
+    for (let i = 0; i < entries.length; i++) {
+      const entry = entries[i];
+      if (entry.timeline) {
         collectWords(entry.timeline);
+        continue;
+      }
+      if (!startsWithSpacelessScript(entry.text) && i !== entries.length - 1) {
+        words.push(entry.text);
+        words.push(" ");
+        continue;
       }
+      words.push(entry.text);
     }
   }
   collectWords(timeline);
-  return words.join(" ");
+  return words.join("");
 }
 function addTimeOffsetToTimeline(targetTimeline, timeOffset) {
   const newTimeline = structuredClone(targetTimeline);

package/dist/utilities/WhisperTimeline.cjs CHANGED Viewed

@@ -28,6 +28,7 @@ __export(WhisperTimeline_exports, {
   scoreTimeline: () => scoreTimeline
 });
 module.exports = __toCommonJS(WhisperTimeline_exports);
+var import_SpacelessScripts = require("./SpacelessScripts.cjs");
 const WHISPER_SAMPLE_RATE = 16e3;
 function calculateWhisperSplits(durationSeconds, numProcessors, sampleRate = WHISPER_SAMPLE_RATE) {
   if (numProcessors <= 1) return [];
@@ -42,26 +43,107 @@ function calculateWhisperSplits(durationSeconds, numProcessors, sampleRate = WHI
   return splits;
 }
 const specialTokenPattern = /\[_.+\]|<\|[a-z_]+\|>/g;
+const REPLACEMENT_CHAR = "\uFFFD";
+function isSpecialToken(text) {
+  return text.startsWith("[_") || text.startsWith("<|");
+}
+function hasUtf8Corruption(text) {
+  return text.includes(REPLACEMENT_CHAR);
+}
+function buildAnchor(items, startIdx, options, maxItems = 5) {
+  let anchor = "";
+  let count = 0;
+  for (let j = startIdx; j < items.length && count < maxItems; j++) {
+    const item = items[j];
+    if (!item) break;
+    const text = options.getText(item);
+    if (options.shouldSkipInAnchor(item)) continue;
+    if (hasUtf8Corruption(text)) break;
+    anchor += text;
+    count++;
+  }
+  return anchor;
+}
+function forEachMergedUtf8Run(items, segmentText, options, emit) {
+  let segPos = 0;
+  let i = 0;
+  while (i < items.length) {
+    const item = items[i];
+    if (!item) break;
+    const text = options.getText(item);
+    const isSkippable = options.shouldSkipInAnchor(item);
+    if (isSkippable || !hasUtf8Corruption(text)) {
+      if (!isSkippable) {
+        segPos += text.length;
+      }
+      emit({
+        first: item,
+        last: item,
+        text,
+        probability: options.getProbability(item),
+        isMerged: false
+      });
+      i++;
+      continue;
+    }
+    const runStart = i;
+    let probability = 1;
+    while (i < items.length) {
+      const runItem = items[i];
+      if (!runItem) break;
+      const runText = options.getText(runItem);
+      const shouldStop = options.shouldSkipInAnchor(runItem) || !hasUtf8Corruption(runText);
+      if (shouldStop) break;
+      probability *= options.getProbability(runItem);
+      i++;
+    }
+    const first = items[runStart];
+    const last = items[i - 1];
+    if (!first || !last) continue;
+    const anchor = buildAnchor(items, i, options);
+    const anchorIdx = anchor.length > 0 ? segmentText.indexOf(anchor, segPos) : -1;
+    const runEndSegPos = anchorIdx >= 0 ? anchorIdx : segmentText.length;
+    const mergedText = segmentText.slice(segPos, runEndSegPos);
+    segPos = runEndSegPos;
+    emit({
+      first,
+      last,
+      text: mergedText,
+      probability,
+      isMerged: true
+    });
+  }
+}
 function parseWhisperCppOutput(transcription) {
   return transcription.map((segment) => {
-    var _a, _b;
     const words = [];
     let lastTokenEndMs = 0;
-    for (const token of segment.tokens) {
-      const cleanedText = token.text.replace(specialTokenPattern, "");
-      if (cleanedText.trim().length === 0) continue;
-      const offsetFrom = ((_a = token.offsets) == null ? void 0 : _a.from) ?? lastTokenEndMs;
-      const offsetTo = ((_b = token.offsets) == null ? void 0 : _b.to) ?? lastTokenEndMs;
-      if (token.offsets) {
-        lastTokenEndMs = token.offsets.to;
+    forEachMergedUtf8Run(
+      segment.tokens,
+      segment.text,
+      {
+        getText: (token) => token.text,
+        getProbability: (token) => token.p,
+        shouldSkipInAnchor: (token) => isSpecialToken(token.text)
+      },
+      (run) => {
+        var _a, _b;
+        const cleanedText = run.text.replace(specialTokenPattern, "");
+        if (cleanedText.trim().length === 0) return;
+        const fallbackOffset = run.isMerged ? 0 : lastTokenEndMs;
+        const offsetFrom = ((_a = run.first.offsets) == null ? void 0 : _a.from) ?? fallbackOffset;
+        const offsetTo = ((_b = run.last.offsets) == null ? void 0 : _b.to) ?? fallbackOffset;
+        if (run.isMerged || run.last.offsets) {
+          lastTokenEndMs = offsetTo;
+        }
+        words.push({
+          text: cleanedText,
+          start: offsetFrom / 1e3,
+          end: offsetTo / 1e3,
+          confidence: run.probability
+        });
       }
-      words.push({
-        text: cleanedText,
-        start: offsetFrom / 1e3,
-        end: offsetTo / 1e3,
-        confidence: token.p
-      });
-    }
+    );
     return {
       text: segment.text,
       segmentStart: segment.offsets.from / 1e3,
@@ -72,12 +154,25 @@ function parseWhisperCppOutput(transcription) {
 }
 function parseWhisperServerOutput(segments) {
   return segments.map((segment) => {
-    const words = (segment.words ?? []).map((word) => ({
-      text: word.word,
-      start: word.start,
-      end: word.end,
-      confidence: word.probability ?? 0
-    }));
+    const words = [];
+    forEachMergedUtf8Run(
+      segment.words ?? [],
+      segment.text,
+      {
+        getText: (word) => word.word,
+        getProbability: (word) => word.probability ?? 1,
+        shouldSkipInAnchor: () => false
+      },
+      (run) => {
+        const confidence = run.isMerged ? run.probability : run.first.probability ?? 0;
+        words.push({
+          text: run.text,
+          start: run.first.start,
+          end: run.last.end,
+          confidence
+        });
+      }
+    );
     return {
       text: segment.text,
       segmentStart: segment.start,
@@ -256,7 +351,8 @@ function extractCorrectedTimeline(segments, options = {}) {
         end: segmentEnd
       });
       const lastEntry = timeline[timeline.length - 1];
-      if (lastEntry && !word.text.startsWith(" ")) {
+      const isSubwordContinuation = !word.text.startsWith(" ") && !(0, import_SpacelessScripts.startsWithSpacelessScript)(trimmedText);
+      if (lastEntry && isSubwordContinuation) {
         lastEntry.text += trimmedText;
         if (lastEntry.confidence !== void 0) {
           lastEntry.confidence = Math.min(lastEntry.confidence, word.confidence);

package/dist/utilities/WhisperTimeline.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { startsWithSpacelessScript } from "./SpacelessScripts.js";
 const WHISPER_SAMPLE_RATE = 16e3;
 function calculateWhisperSplits(durationSeconds, numProcessors, sampleRate = WHISPER_SAMPLE_RATE) {
   if (numProcessors <= 1) return [];
@@ -12,26 +13,107 @@ function calculateWhisperSplits(durationSeconds, numProcessors, sampleRate = WHI
   return splits;
 }
 const specialTokenPattern = /\[_.+\]|<\|[a-z_]+\|>/g;
+const REPLACEMENT_CHAR = "\uFFFD";
+function isSpecialToken(text) {
+  return text.startsWith("[_") || text.startsWith("<|");
+}
+function hasUtf8Corruption(text) {
+  return text.includes(REPLACEMENT_CHAR);
+}
+function buildAnchor(items, startIdx, options, maxItems = 5) {
+  let anchor = "";
+  let count = 0;
+  for (let j = startIdx; j < items.length && count < maxItems; j++) {
+    const item = items[j];
+    if (!item) break;
+    const text = options.getText(item);
+    if (options.shouldSkipInAnchor(item)) continue;
+    if (hasUtf8Corruption(text)) break;
+    anchor += text;
+    count++;
+  }
+  return anchor;
+}
+function forEachMergedUtf8Run(items, segmentText, options, emit) {
+  let segPos = 0;
+  let i = 0;
+  while (i < items.length) {
+    const item = items[i];
+    if (!item) break;
+    const text = options.getText(item);
+    const isSkippable = options.shouldSkipInAnchor(item);
+    if (isSkippable || !hasUtf8Corruption(text)) {
+      if (!isSkippable) {
+        segPos += text.length;
+      }
+      emit({
+        first: item,
+        last: item,
+        text,
+        probability: options.getProbability(item),
+        isMerged: false
+      });
+      i++;
+      continue;
+    }
+    const runStart = i;
+    let probability = 1;
+    while (i < items.length) {
+      const runItem = items[i];
+      if (!runItem) break;
+      const runText = options.getText(runItem);
+      const shouldStop = options.shouldSkipInAnchor(runItem) || !hasUtf8Corruption(runText);
+      if (shouldStop) break;
+      probability *= options.getProbability(runItem);
+      i++;
+    }
+    const first = items[runStart];
+    const last = items[i - 1];
+    if (!first || !last) continue;
+    const anchor = buildAnchor(items, i, options);
+    const anchorIdx = anchor.length > 0 ? segmentText.indexOf(anchor, segPos) : -1;
+    const runEndSegPos = anchorIdx >= 0 ? anchorIdx : segmentText.length;
+    const mergedText = segmentText.slice(segPos, runEndSegPos);
+    segPos = runEndSegPos;
+    emit({
+      first,
+      last,
+      text: mergedText,
+      probability,
+      isMerged: true
+    });
+  }
+}
 function parseWhisperCppOutput(transcription) {
   return transcription.map((segment) => {
-    var _a, _b;
     const words = [];
     let lastTokenEndMs = 0;
-    for (const token of segment.tokens) {
-      const cleanedText = token.text.replace(specialTokenPattern, "");
-      if (cleanedText.trim().length === 0) continue;
-      const offsetFrom = ((_a = token.offsets) == null ? void 0 : _a.from) ?? lastTokenEndMs;
-      const offsetTo = ((_b = token.offsets) == null ? void 0 : _b.to) ?? lastTokenEndMs;
-      if (token.offsets) {
-        lastTokenEndMs = token.offsets.to;
+    forEachMergedUtf8Run(
+      segment.tokens,
+      segment.text,
+      {
+        getText: (token) => token.text,
+        getProbability: (token) => token.p,
+        shouldSkipInAnchor: (token) => isSpecialToken(token.text)
+      },
+      (run) => {
+        var _a, _b;
+        const cleanedText = run.text.replace(specialTokenPattern, "");
+        if (cleanedText.trim().length === 0) return;
+        const fallbackOffset = run.isMerged ? 0 : lastTokenEndMs;
+        const offsetFrom = ((_a = run.first.offsets) == null ? void 0 : _a.from) ?? fallbackOffset;
+        const offsetTo = ((_b = run.last.offsets) == null ? void 0 : _b.to) ?? fallbackOffset;
+        if (run.isMerged || run.last.offsets) {
+          lastTokenEndMs = offsetTo;
+        }
+        words.push({
+          text: cleanedText,
+          start: offsetFrom / 1e3,
+          end: offsetTo / 1e3,
+          confidence: run.probability
+        });
       }
-      words.push({
-        text: cleanedText,
-        start: offsetFrom / 1e3,
-        end: offsetTo / 1e3,
-        confidence: token.p
-      });
-    }
+    );
     return {
       text: segment.text,
       segmentStart: segment.offsets.from / 1e3,
@@ -42,12 +124,25 @@ function parseWhisperCppOutput(transcription) {
 }
 function parseWhisperServerOutput(segments) {
   return segments.map((segment) => {
-    const words = (segment.words ?? []).map((word) => ({
-      text: word.word,
-      start: word.start,
-      end: word.end,
-      confidence: word.probability ?? 0
-    }));
+    const words = [];
+    forEachMergedUtf8Run(
+      segment.words ?? [],
+      segment.text,
+      {
+        getText: (word) => word.word,
+        getProbability: (word) => word.probability ?? 1,
+        shouldSkipInAnchor: () => false
+      },
+      (run) => {
+        const confidence = run.isMerged ? run.probability : run.first.probability ?? 0;
+        words.push({
+          text: run.text,
+          start: run.first.start,
+          end: run.last.end,
+          confidence
+        });
+      }
+    );
     return {
       text: segment.text,
       segmentStart: segment.start,
@@ -226,7 +321,8 @@ function extractCorrectedTimeline(segments, options = {}) {
         end: segmentEnd
       });
       const lastEntry = timeline[timeline.length - 1];
-      if (lastEntry && !word.text.startsWith(" ")) {
+      const isSubwordContinuation = !word.text.startsWith(" ") && !startsWithSpacelessScript(trimmedText);
+      if (lastEntry && isSubwordContinuation) {
         lastEntry.text += trimmedText;
         if (lastEntry.confidence !== void 0) {
           lastEntry.confidence = Math.min(lastEntry.confidence, word.confidence);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@storyteller-platform/ghost-story",
-  "version": "0.1.6",
+  "version": "0.1.7",
   "description": "An easy-to-use speech toolset. Fork of the original echogarden project.",
   "author": "Thomas F. K. Jorna",
   "license": "GPL-3.0",