@speakableio/core 1.0.25 → 1.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -73,6 +73,7 @@ __export(index_native_exports, {
73
73
  getSetFromCache: () => getSetFromCache,
74
74
  getTotalCompletedCards: () => getTotalCompletedCards,
75
75
  getTranscript: () => getTranscript,
76
+ getTranscriptCycle: () => getTranscriptCycle,
76
77
  getWordHash: () => getWordHash,
77
78
  purify: () => purify,
78
79
  refsCardsFiresotre: () => refsCardsFiresotre,
@@ -97,6 +98,7 @@ __export(index_native_exports, {
97
98
  useSet: () => useSet,
98
99
  useSpeakableApi: () => useSpeakableApi,
99
100
  useSpeakableTranscript: () => useSpeakableTranscript,
101
+ useSpeakableTranscriptCycle: () => useSpeakableTranscriptCycle,
100
102
  useSubmitAssignmentScore: () => useSubmitAssignmentScore,
101
103
  useSubmitPracticeScore: () => useSubmitPracticeScore,
102
104
  useUpdateCardScore: () => useUpdateCardScore,
@@ -1837,7 +1839,7 @@ var getCard = withErrorHandler(_getCard, "getCard");
1837
1839
  var import_uuid = require("uuid");
1838
1840
 
1839
1841
  // src/utils/text-utils.ts
1840
- var import_js_sha1 = __toESM(require("js-sha1"));
1842
+ var import_js_sha1 = require("js-sha1");
1841
1843
  var purify = (word) => {
1842
1844
  return word.normalize("NFD").replace(/\/([^" "]*)/g, "").replace(/\([^()]*\)/g, "").replace(/([^()]*)/g, "").replace(/[\u0300-\u036f]/g, "").replace(/[-]/g, " ").replace(/[.,/#!¡¿?؟。,.?$%^&*;:{}=\-_`~()’'…\s]/g, "").replace(/\s\s+/g, " ").toLowerCase().trim();
1843
1845
  };
@@ -1855,7 +1857,7 @@ var cleanString = (words) => {
1855
1857
  };
1856
1858
  var getWordHash = (word, language) => {
1857
1859
  const cleanedWord = cleanString(word);
1858
- const wordHash = (0, import_js_sha1.default)(`${language}-${cleanedWord}`);
1860
+ const wordHash = (0, import_js_sha1.sha1)(`${language}-${cleanedWord}`);
1859
1861
  console.log("wordHash core library", wordHash);
1860
1862
  return wordHash;
1861
1863
  };
@@ -2242,8 +2244,209 @@ var createSetRepo = () => {
2242
2244
  };
2243
2245
  };
2244
2246
 
2247
+ // src/utils/ai/detect-transcript-hallucionation.ts
2248
/**
 * Tuning values shared by the transcript hallucination detectors in this
 * module (detectShortRepeats, detectPhraseRepeats, detectCyclicPattern,
 * isSimilarSentence, isSegmentSimilar and the entropy check).
 */
var HALLUCINATION_THRESHOLDS = {
  // ---- Short repeats (word level) ----
  // Length of a run of identical consecutive words that is flagged.
  MIN_CONSECUTIVE_REPEATS: 3,
  // The vocabulary-ratio check only applies from this many words upwards...
  MIN_WORDS_FOR_RATIO_CHECK: 10,
  // ...when at most this many distinct words are present...
  MAX_UNIQUE_WORDS_FOR_RATIO: 3,
  // ...and total words / distinct words reaches this ratio.
  MIN_REPETITION_RATIO: 3,

  // ---- Phrase repeats (sentence level) ----
  // Sentences must be strictly longer than this many characters to count.
  MIN_SENTENCE_LENGTH: 10,
  // Length of a run of similar consecutive sentences that is flagged.
  MIN_CONSECUTIVE_SIMILAR_SENTENCES: 2,
  // Minimum number of sentences before the "all identical" check applies.
  MIN_SENTENCES_FOR_DUPLICATE_CHECK: 3,

  // ---- Cyclic patterns (character level) ----
  // Shortest repeating prefix length that is tried.
  MIN_CYCLE_LENGTH: 20,
  // Number of back-to-back occurrences of the prefix that is flagged.
  MIN_CYCLE_REPEATS: 3,

  // ---- Entropy detection ----
  // Texts shorter than this are not entropy-checked.
  MIN_LENGTH_FOR_ENTROPY_CHECK: 50,
  // Entropy below this value is flagged; bits per character.
  MAX_ENTROPY_THRESHOLD: 2.5,

  // ---- Similarity ----
  // Default word-set similarity for two sentences to count as "similar".
  SENTENCE_SIMILARITY_THRESHOLD: 0.8,
  // Share of matching character positions for two segments to count as "similar".
  SEGMENT_SIMILARITY_THRESHOLD: 0.85
};
2269
/**
 * Runs the hallucination heuristics over a transcript and reports the first
 * one that fires.
 *
 * Checks run in a fixed order and stop at the first hit: short word repeats,
 * repeated sentences, cyclic repetition, then (for long enough text) low
 * character entropy.
 *
 * @param {string} transcript - Raw transcript text; falsy or blank input is
 *   treated as "not a hallucination".
 * @returns {{isHallucination: boolean, reason?: string, confidence?: number}}
 *   `reason` and `confidence` are present only when `isHallucination` is true.
 */
function detectTranscriptHallucinationWithDetails(transcript) {
  // Falsy, blank and very short (< 10 chars after trimming) inputs are never flagged.
  const text = transcript ? transcript.trim() : "";
  if (text.length < 10) {
    return { isHallucination: false };
  }

  // Ordered [detector, reason, confidence] triples; the first match wins.
  const repetitionChecks = [
    [detectShortRepeats, "Detected repeated short words or phrases", 0.9],
    [detectPhraseRepeats, "Detected repeated sentences or phrases", 0.85],
    [detectCyclicPattern, "Detected cyclic repetition pattern", 0.8]
  ];
  for (const [detector, reason, confidence] of repetitionChecks) {
    if (detector(text)) {
      return { isHallucination: true, reason, confidence };
    }
  }

  // The entropy check is only applied once the text is long enough.
  const isLongEnough = text.length >= HALLUCINATION_THRESHOLDS.MIN_LENGTH_FOR_ENTROPY_CHECK;
  if (isLongEnough && calculateEntropy(text) < HALLUCINATION_THRESHOLDS.MAX_ENTROPY_THRESHOLD) {
    return {
      isHallucination: true,
      reason: "Detected low entropy (likely gibberish or excessive repetition)",
      confidence: 0.75
    };
  }

  return { isHallucination: false };
}
2313
/**
 * Detects word-level repetition: either the same word several times in a row,
 * or a longer text built from a very small vocabulary.
 *
 * Words are obtained by lower-casing and splitting on whitespace and the
 * characters `, ; . ! ?`. Texts with fewer than four words are never flagged.
 *
 * @param {string} text - Transcript text to inspect.
 * @returns {boolean} True when a repetition pattern is found.
 */
function detectShortRepeats(text) {
  const tokens = text
    .toLowerCase()
    .split(/[\s,;.!?]+/)
    .filter((token) => token.length > 0);
  if (tokens.length < 4) {
    return false;
  }

  const {
    MIN_CONSECUTIVE_REPEATS,
    MIN_WORDS_FOR_RATIO_CHECK,
    MAX_UNIQUE_WORDS_FOR_RATIO,
    MIN_REPETITION_RATIO
  } = HALLUCINATION_THRESHOLDS;

  // Pass 1: look for a run of identical neighbouring words.
  let runLength = 1;
  for (let idx = 1; idx < tokens.length; idx++) {
    if (tokens[idx] !== tokens[idx - 1]) {
      runLength = 1;
      continue;
    }
    runLength += 1;
    if (runLength >= MIN_CONSECUTIVE_REPEATS) {
      return true;
    }
  }

  // Pass 2: many words drawn from very few distinct ones.
  const distinctCount = new Set(tokens).size;
  const hasEnoughWords = tokens.length >= MIN_WORDS_FOR_RATIO_CHECK;
  const hasTinyVocabulary = distinctCount <= MAX_UNIQUE_WORDS_FOR_RATIO;
  const isHighlyRepetitive = tokens.length / distinctCount >= MIN_REPETITION_RATIO;
  return hasEnoughWords && hasTinyVocabulary && isHighlyRepetitive;
}
2334
/**
 * Detects sentence-level repetition.
 *
 * The text is split on runs of `.`, `!` and `?`; each piece is trimmed and
 * lower-cased, and only pieces strictly longer than MIN_SENTENCE_LENGTH
 * characters are kept. The text is flagged when a sentence is followed by a
 * run of sentences similar to it (per isSimilarSentence), or when every kept
 * sentence is identical.
 *
 * @param {string} text - Transcript text to inspect.
 * @returns {boolean} True when repeated sentences are found.
 */
function detectPhraseRepeats(text) {
  const {
    MIN_SENTENCE_LENGTH,
    MIN_CONSECUTIVE_SIMILAR_SENTENCES,
    MIN_SENTENCES_FOR_DUPLICATE_CHECK
  } = HALLUCINATION_THRESHOLDS;

  const phrases = [];
  for (const rawPiece of text.split(/[.!?]+/)) {
    const phrase = rawPiece.trim().toLowerCase();
    if (phrase.length > MIN_SENTENCE_LENGTH) {
      phrases.push(phrase);
    }
  }
  if (phrases.length < 2) {
    return false;
  }

  // For each starting sentence, count how many directly following sentences resemble it.
  for (let start = 0; start < phrases.length - 1; start++) {
    let runLength = 1;
    let next = start + 1;
    while (next < phrases.length && isSimilarSentence(phrases[start], phrases[next])) {
      runLength++;
      next++;
    }
    if (runLength >= MIN_CONSECUTIVE_SIMILAR_SENTENCES) {
      return true;
    }
  }

  // Fallback: enough sentences and all of them are the very same string.
  const allIdentical = new Set(phrases).size === 1;
  return phrases.length >= MIN_SENTENCES_FOR_DUPLICATE_CHECK && allIdentical;
}
2356
/**
 * Decides whether two sentences are near-duplicates.
 *
 * Sentences are equal-or-similar when they match exactly, match after
 * collapsing whitespace, or when the overlap of their distinct-word sets
 * (2 * shared / (sizeA + sizeB)) reaches `threshold`. Sentences whose word
 * counts differ by more than two are never similar.
 *
 * @param {string} s1 - First sentence.
 * @param {string} s2 - Second sentence.
 * @param {number} [threshold] - Minimum overlap score; defaults to
 *   HALLUCINATION_THRESHOLDS.SENTENCE_SIMILARITY_THRESHOLD.
 * @returns {boolean} True when the sentences are considered similar.
 */
function isSimilarSentence(s1, s2, threshold = HALLUCINATION_THRESHOLDS.SENTENCE_SIMILARITY_THRESHOLD) {
  if (s1 === s2) {
    return true;
  }

  const [left, right] = [s1, s2].map((sentence) => sentence.replace(/\s+/g, " ").trim());
  if (left === right) {
    return true;
  }

  const leftWords = left.split(/\s+/);
  const rightWords = right.split(/\s+/);
  if (Math.abs(leftWords.length - rightWords.length) > 2) {
    return false;
  }

  const leftVocabulary = new Set(leftWords);
  const rightVocabulary = new Set(rightWords);
  let sharedCount = 0;
  for (const word of leftVocabulary) {
    if (rightVocabulary.has(word)) {
      sharedCount++;
    }
  }

  const overlap = sharedCount * 2 / (leftVocabulary.size + rightVocabulary.size);
  return overlap >= threshold;
}
2370
/**
 * Detects text that is a prefix repeated back-to-back from the very start.
 *
 * The text is lower-cased and whitespace-collapsed. Candidate prefix lengths
 * run from MIN_CYCLE_LENGTH up to half the text length in steps of 5. For each
 * candidate the text is walked in prefix-sized chunks; a chunk counts when it
 * equals the prefix or is close to it per isSegmentSimilar. A shorter trailing
 * chunk counts when the prefix starts with it and at least one full chunk
 * already matched.
 *
 * @param {string} text - Transcript text to inspect.
 * @returns {boolean} True when some prefix occurs at least MIN_CYCLE_REPEATS times.
 */
function detectCyclicPattern(text) {
  const haystack = text.toLowerCase().replace(/\s+/g, " ").trim();
  const total = haystack.length;
  const { MIN_CYCLE_LENGTH, MIN_CYCLE_REPEATS } = HALLUCINATION_THRESHOLDS;

  const longestCycle = Math.floor(total / 2);
  if (longestCycle < MIN_CYCLE_LENGTH) {
    return false;
  }

  const cycleStep = 5;
  for (let period = MIN_CYCLE_LENGTH; period <= longestCycle; period += cycleStep) {
    const template = haystack.substring(0, period);
    let repeats = 0;

    for (let offset = 0; offset < total; offset += period) {
      const chunk = haystack.substring(offset, offset + period);

      if (chunk.length < period) {
        // Trailing fragment: accept it only as the continuation of an existing cycle.
        if (repeats > 0 && template.startsWith(chunk)) {
          repeats++;
        }
        break;
      }

      if (chunk !== template && !isSegmentSimilar(chunk, template)) {
        break;
      }
      repeats++;
    }

    if (repeats >= MIN_CYCLE_REPEATS) {
      return true;
    }
  }

  return false;
}
2403
/**
 * Compares two equal-length text segments position by position.
 *
 * Segments of different length are never similar. Otherwise the share of
 * positions holding the same character must reach `threshold`.
 *
 * @param {string} s1 - First segment.
 * @param {string} s2 - Second segment.
 * @param {number} [threshold] - Minimum share of matching positions (0..1);
 *   defaults to HALLUCINATION_THRESHOLDS.SEGMENT_SIMILARITY_THRESHOLD, which
 *   mirrors how isSimilarSentence exposes its threshold.
 * @returns {boolean} True when the segments are identical or similar enough.
 */
function isSegmentSimilar(s1, s2, threshold = HALLUCINATION_THRESHOLDS.SEGMENT_SIMILARITY_THRESHOLD) {
  if (s1 === s2) {
    return true;
  }
  if (s1.length !== s2.length) {
    return false;
  }

  // Both lengths are equal and non-zero here (two empty strings are caught by
  // the identity check above), so the division below is safe.
  const segmentLength = s1.length;
  let matches = 0;
  for (let i = 0; i < segmentLength; i++) {
    if (s1[i] === s2[i]) {
      matches++;
    }
  }

  return matches / segmentLength >= threshold;
}
2416
/**
 * Computes the Shannon entropy of a text, in bits per character.
 *
 * Characters are compared case-insensitively (the text is lower-cased first)
 * and counted as Unicode code points, which is how string iteration yields
 * them.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Entropy in bits per character; 0 for empty or falsy input.
 */
function calculateEntropy(text) {
  if (!text || text.length === 0) {
    return 0;
  }

  const frequencies = /* @__PURE__ */ new Map();
  // Count the symbols actually iterated. `text.length` counts UTF-16 code
  // units of the original text, which differs from the number of code points
  // for astral characters (e.g. emoji) and when lower-casing changes the
  // length; using it would make the probabilities not sum to 1.
  let symbolCount = 0;
  for (const char of text.toLowerCase()) {
    frequencies.set(char, (frequencies.get(char) || 0) + 1);
    symbolCount++;
  }

  let entropy = 0;
  for (const count of frequencies.values()) {
    const probability = count / symbolCount;
    entropy -= probability * Math.log2(probability);
  }
  return entropy;
}
2432
/**
 * Returns the transcript unchanged unless it is detected as a hallucination,
 * in which case a warning is logged and an empty string is returned instead.
 *
 * @param {string} transcript - Transcript text to vet.
 * @returns {string} The original transcript, or "" when it was flagged.
 */
function cleanHallucinatedTranscript(transcript) {
  const { isHallucination, reason, confidence } = detectTranscriptHallucinationWithDetails(transcript);
  if (!isHallucination) {
    return transcript;
  }

  // Only the first 100 characters of the rejected transcript are logged.
  console.warn(
    "Hallucinated transcript detected and removed:",
    transcript.substring(0, 100),
    `\nReason: ${reason != null ? reason : "Unknown"}`,
    `Confidence: ${String(confidence != null ? confidence : "Unknown")}`
  );
  return "";
}
2447
+
2245
2448
  // src/utils/ai/get-transcript.ts
2246
- async function getTranscript(model, args) {
2449
+ async function getTranscript(model, args, cleanHallucinations = true) {
2247
2450
  var _a, _b, _c, _d, _e, _f;
2248
2451
  const getGeminiTranscript = (_b = (_a = api).httpsCallable) == null ? void 0 : _b.call(_a, "getGeminiTranscript");
2249
2452
  const getAssemblyAITranscript = (_d = (_c = api).httpsCallable) == null ? void 0 : _d.call(_c, "transcribeAssemblyAIAudio");
@@ -2254,7 +2457,7 @@ async function getTranscript(model, args) {
2254
2457
  audioUrl: args.audioUrl,
2255
2458
  language: args.language
2256
2459
  }));
2257
- return data;
2460
+ return cleanHallucinations ? cleanHallucinatedTranscript(data) : data;
2258
2461
  } catch (error) {
2259
2462
  console.error("Error getting transcript from Whisper:", error);
2260
2463
  throw error;
@@ -2267,7 +2470,7 @@ async function getTranscript(model, args) {
2267
2470
  targetLanguage: args.language,
2268
2471
  prompt: args.prompt
2269
2472
  }));
2270
- return data.transcript;
2473
+ return cleanHallucinations ? cleanHallucinatedTranscript(data.transcript) : data.transcript;
2271
2474
  } catch (error) {
2272
2475
  console.error("Error getting transcript from Gemini:", error);
2273
2476
  throw error;
@@ -2279,7 +2482,7 @@ async function getTranscript(model, args) {
2279
2482
  audioUrl: args.audioUrl,
2280
2483
  language: args.language
2281
2484
  }));
2282
- return response.data;
2485
+ return cleanHallucinations ? cleanHallucinatedTranscript(response.data) : response.data;
2283
2486
  } catch (error) {
2284
2487
  console.error("Error getting transcript from AssemblyAI:", error);
2285
2488
  throw error;
@@ -2287,6 +2490,37 @@ async function getTranscript(model, args) {
2287
2490
  }
2288
2491
  return null;
2289
2492
  }
2493
/**
 * Tries the transcript models one after another ("whisper", "gemini",
 * "assemblyai") and returns the first non-empty transcript that survives
 * hallucination cleaning.
 *
 * getTranscript is called with its own cleaning switched off so that the
 * cleaning happens here, once, and an emptied result moves on to the next
 * model. Errors from a model are logged and do not stop the cycle.
 *
 * @param {object} args - Arguments forwarded unchanged to getTranscript.
 * @returns {Promise<{transcript: string, success: boolean}>} `success` is
 *   false (with an empty transcript) when every model failed or came back empty.
 */
async function getTranscriptCycle(args) {
  const providers = ["whisper", "gemini", "assemblyai"];
  let lastError = null;

  for (const model of providers) {
    try {
      const rawTranscript = (await getTranscript(model, args, false)) || "";
      const transcript = cleanHallucinatedTranscript(rawTranscript);
      if (transcript !== "") {
        console.log(`Successfully got transcript from ${model}`);
        return { transcript, success: true };
      }
      console.warn(`${model} returned empty transcript, trying next model`);
    } catch (e) {
      console.error(`Error with ${model} transcript:`, e);
      lastError = e;
    }
  }

  // Reached only when no model produced a usable transcript.
  console.error("All transcript models failed or returned empty", lastError);
  return { transcript: "", success: false };
}
2290
2524
 
2291
2525
  // src/constants/all-langs.json
2292
2526
  var all_langs_default = {
@@ -3033,6 +3267,17 @@ function useSpeakableTranscript() {
3033
3267
  mutation
3034
3268
  };
3035
3269
  }
3270
/**
 * Hook exposing getTranscriptCycle as a mutation created with `useMutation`.
 * Retries are disabled on the mutation.
 *
 * @returns {{mutationTranscriptCycle: object}} The mutation object returned by
 *   `useMutation`, under the key `mutationTranscriptCycle`.
 */
function useSpeakableTranscriptCycle() {
  const mutationTranscriptCycle = (0, import_react_query7.useMutation)({
    retry: false,
    mutationFn: async (args) => getTranscriptCycle(args)
  });
  return { mutationTranscriptCycle };
}
3036
3281
 
3037
3282
  // src/hooks/useUpdateStudentVoc.ts
3038
3283
  var useUpdateStudentVocab = (page) => {