@speakableio/core 1.0.25 → 1.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.native.d.mts +39 -2
- package/dist/index.native.d.ts +39 -2
- package/dist/index.native.js +251 -6
- package/dist/index.native.js.map +1 -1
- package/dist/index.native.mjs +250 -5
- package/dist/index.native.mjs.map +1 -1
- package/dist/index.web.d.mts +39 -2
- package/dist/index.web.js +250 -5
- package/dist/index.web.js.map +1 -1
- package/package.json +1 -1
package/dist/index.native.js
CHANGED
|
@@ -73,6 +73,7 @@ __export(index_native_exports, {
|
|
|
73
73
|
getSetFromCache: () => getSetFromCache,
|
|
74
74
|
getTotalCompletedCards: () => getTotalCompletedCards,
|
|
75
75
|
getTranscript: () => getTranscript,
|
|
76
|
+
getTranscriptCycle: () => getTranscriptCycle,
|
|
76
77
|
getWordHash: () => getWordHash,
|
|
77
78
|
purify: () => purify,
|
|
78
79
|
refsCardsFiresotre: () => refsCardsFiresotre,
|
|
@@ -97,6 +98,7 @@ __export(index_native_exports, {
|
|
|
97
98
|
useSet: () => useSet,
|
|
98
99
|
useSpeakableApi: () => useSpeakableApi,
|
|
99
100
|
useSpeakableTranscript: () => useSpeakableTranscript,
|
|
101
|
+
useSpeakableTranscriptCycle: () => useSpeakableTranscriptCycle,
|
|
100
102
|
useSubmitAssignmentScore: () => useSubmitAssignmentScore,
|
|
101
103
|
useSubmitPracticeScore: () => useSubmitPracticeScore,
|
|
102
104
|
useUpdateCardScore: () => useUpdateCardScore,
|
|
@@ -1837,7 +1839,7 @@ var getCard = withErrorHandler(_getCard, "getCard");
|
|
|
1837
1839
|
var import_uuid = require("uuid");
|
|
1838
1840
|
|
|
1839
1841
|
// src/utils/text-utils.ts
|
|
1840
|
-
var import_js_sha1 =
|
|
1842
|
+
var import_js_sha1 = require("js-sha1");
|
|
1841
1843
|
/**
 * Reduces a word or short phrase to a bare comparison key.
 *
 * Steps, in order: NFD-normalise, drop a "/alternative" suffix, drop
 * parenthetical notes, strip combining accents, strip punctuation and all
 * whitespace, then lower-case and trim.
 *
 * @param {string} word - Text to normalise.
 * @returns {string} The normalised key (may be empty).
 */
var purify = (word) => {
  return word
    .normalize("NFD")
    // "él/ella" -> "él": everything from a slash up to the next quote/space.
    .replace(/\/([^" "]*)/g, "")
    // "gato (cat)" -> "gato ": ASCII parenthetical notes.
    .replace(/\([^()]*\)/g, "")
    // Full-width parenthetical notes. The previous pattern here had unescaped
    // parentheses, which made it a capture group that matched (and erased)
    // every run of non-parenthesis characters, i.e. the whole word.
    // NOTE(review): full-width intent is inferred from the ASCII step above — confirm.
    .replace(/\uFF08[^\uFF08\uFF09]*\uFF09/g, "")
    // Combining diacritical marks left behind by NFD.
    .replace(/[\u0300-\u036f]/g, "")
    .replace(/[-]/g, " ")
    .replace(/[.,/#!¡¿?؟。,.?$%^&*;:{}=\-_`~()’'…\s]/g, "")
    .replace(/\s\s+/g, " ")
    .toLowerCase()
    .trim();
};
|
|
@@ -1855,7 +1857,7 @@ var cleanString = (words) => {
|
|
|
1855
1857
|
};
|
|
1856
1858
|
/**
 * Computes the SHA-1 hash of a word within a language.
 *
 * The word is first passed through `cleanString`, then hashed as
 * `"<language>-<cleanedWord>"`.
 *
 * @param {string} word - Word to hash.
 * @param {string} language - Language identifier used as the hash prefix.
 * @returns {string} The value returned by `sha1` for the prefixed word.
 */
var getWordHash = (word, language) => {
  const cleanedWord = cleanString(word);
  // The debug console.log of every hash was removed: a library should not
  // write each hashed word to the host application's console.
  return (0, import_js_sha1.sha1)(`${language}-${cleanedWord}`);
};
|
|
@@ -2242,8 +2244,209 @@ var createSetRepo = () => {
|
|
|
2242
2244
|
};
|
|
2243
2245
|
};
|
|
2244
2246
|
|
|
2247
|
+
// src/utils/ai/detect-transcript-hallucionation.ts
|
|
2248
|
+
/**
 * Tuning constants shared by the transcript hallucination heuristics.
 * Frozen so no code path can alter detector behaviour at runtime by accident.
 */
var HALLUCINATION_THRESHOLDS = Object.freeze({
  // Short repeats
  // Length of a run of identical adjacent words that counts as a hallucination.
  MIN_CONSECUTIVE_REPEATS: 3,
  // The word-ratio check only applies from this many words upwards...
  MIN_WORDS_FOR_RATIO_CHECK: 10,
  // ...when the transcript uses at most this many distinct words...
  MAX_UNIQUE_WORDS_FOR_RATIO: 3,
  // ...and total words / distinct words reaches this ratio.
  MIN_REPETITION_RATIO: 3,
  // Phrase repeats
  // Sentences must be strictly longer than this many characters to be compared.
  MIN_SENTENCE_LENGTH: 10,
  // Run length of adjacent similar sentences that counts as a hallucination.
  MIN_CONSECUTIVE_SIMILAR_SENTENCES: 2,
  // Minimum sentence count for the "every sentence is identical" check.
  MIN_SENTENCES_FOR_DUPLICATE_CHECK: 3,
  // Cyclic patterns
  // Shortest repeating unit, in characters, that is searched for.
  MIN_CYCLE_LENGTH: 20,
  // Number of back-to-back repeats of the unit required to flag the text.
  MIN_CYCLE_REPEATS: 3,
  // Entropy detection
  // Entropy is only evaluated for texts of at least this many characters.
  MIN_LENGTH_FOR_ENTROPY_CHECK: 50,
  // Texts whose entropy is below this value (bits per character) are flagged.
  MAX_ENTROPY_THRESHOLD: 2.5,
  // Similarity
  // Minimum word-set overlap score for two sentences to count as similar.
  SENTENCE_SIMILARITY_THRESHOLD: 0.8,
  // Minimum fraction of matching positions for two equal-length segments.
  SEGMENT_SIMILARITY_THRESHOLD: 0.85
});
|
|
2269
|
+
/**
 * Runs the hallucination heuristics over a transcript and reports the first
 * one that fires.
 *
 * Checks run in a fixed order: repeated short words, repeated sentences,
 * cyclic repetition, then (for texts of at least
 * MIN_LENGTH_FOR_ENTROPY_CHECK characters) low character entropy. Each check
 * carries its own fixed confidence value.
 *
 * @param {string} transcript - Raw transcript text.
 * @returns {{isHallucination: boolean, reason?: string, confidence?: number}}
 *   `reason` and `confidence` are only present when `isHallucination` is true.
 */
function detectTranscriptHallucinationWithDetails(transcript) {
  // Anything that is not a string (null, undefined, a number, an object
  // payload) cannot be analysed; report it as clean instead of throwing on
  // `.trim()`.
  if (typeof transcript !== "string") {
    return { isHallucination: false };
  }
  const text = transcript.trim();
  // Covers empty / whitespace-only input and texts too short to judge.
  if (text.length < 10) {
    return { isHallucination: false };
  }
  if (detectShortRepeats(text)) {
    return {
      isHallucination: true,
      reason: "Detected repeated short words or phrases",
      confidence: 0.9
    };
  }
  if (detectPhraseRepeats(text)) {
    return {
      isHallucination: true,
      reason: "Detected repeated sentences or phrases",
      confidence: 0.85
    };
  }
  if (detectCyclicPattern(text)) {
    return {
      isHallucination: true,
      reason: "Detected cyclic repetition pattern",
      confidence: 0.8
    };
  }
  if (text.length >= HALLUCINATION_THRESHOLDS.MIN_LENGTH_FOR_ENTROPY_CHECK) {
    const entropy = calculateEntropy(text);
    if (entropy < HALLUCINATION_THRESHOLDS.MAX_ENTROPY_THRESHOLD) {
      return {
        isHallucination: true,
        reason: "Detected low entropy (likely gibberish or excessive repetition)",
        confidence: 0.75
      };
    }
  }
  return { isHallucination: false };
}
|
|
2313
|
+
/**
 * Flags transcripts dominated by repeated words.
 *
 * Fires when the same word occurs in a long enough unbroken run, or when a
 * sufficiently long transcript is built from only a handful of distinct words.
 * Transcripts with fewer than four words are never flagged.
 *
 * @param {string} text - Trimmed transcript text.
 * @returns {boolean} True when a short-repeat pattern is found.
 */
function detectShortRepeats(text) {
  const tokens = text.toLowerCase().split(/[\s,;.!?]+/).filter(Boolean);
  if (tokens.length < 4) {
    return false;
  }

  // Pass 1: look for a run of identical adjacent words.
  let runLength = 1;
  let previous = tokens[0];
  for (const token of tokens.slice(1)) {
    if (token !== previous) {
      previous = token;
      runLength = 1;
      continue;
    }
    runLength += 1;
    if (runLength >= HALLUCINATION_THRESHOLDS.MIN_CONSECUTIVE_REPEATS) {
      return true;
    }
  }

  // Pass 2: many words overall, but very few distinct ones.
  const distinctCount = new Set(tokens).size;
  const isLongEnough = tokens.length >= HALLUCINATION_THRESHOLDS.MIN_WORDS_FOR_RATIO_CHECK;
  const hasFewDistinct = distinctCount <= HALLUCINATION_THRESHOLDS.MAX_UNIQUE_WORDS_FOR_RATIO;
  const isRepetitive = tokens.length / distinctCount >= HALLUCINATION_THRESHOLDS.MIN_REPETITION_RATIO;
  return isLongEnough && hasFewDistinct && isRepetitive;
}
|
|
2334
|
+
/**
 * Flags transcripts in which whole sentences repeat.
 *
 * The text is split on `.`, `!` and `?`; pieces are trimmed, lower-cased and
 * kept only when strictly longer than MIN_SENTENCE_LENGTH. The transcript is
 * flagged when some sentence is followed by an unbroken run of similar
 * sentences, or when every kept sentence is identical.
 *
 * @param {string} text - Trimmed transcript text.
 * @returns {boolean} True when a repeated-sentence pattern is found.
 */
function detectPhraseRepeats(text) {
  const candidates = [];
  for (const piece of text.split(/[.!?]+/)) {
    const sentence = piece.trim().toLowerCase();
    if (sentence.length > HALLUCINATION_THRESHOLDS.MIN_SENTENCE_LENGTH) {
      candidates.push(sentence);
    }
  }
  if (candidates.length < 2) {
    return false;
  }

  // For each anchor sentence, measure the run of immediately following
  // sentences that are similar to that anchor.
  const hasSimilarRun = candidates.slice(0, -1).some((anchor, index) => {
    let runLength = 1;
    let next = index + 1;
    while (next < candidates.length && isSimilarSentence(anchor, candidates[next])) {
      runLength += 1;
      next += 1;
    }
    return runLength >= HALLUCINATION_THRESHOLDS.MIN_CONSECUTIVE_SIMILAR_SENTENCES;
  });
  if (hasSimilarRun) {
    return true;
  }

  const hasEnoughSentences = candidates.length >= HALLUCINATION_THRESHOLDS.MIN_SENTENCES_FOR_DUPLICATE_CHECK;
  return hasEnoughSentences && new Set(candidates).size === 1;
}
|
|
2356
|
+
/**
 * Decides whether two sentences are near-duplicates.
 *
 * Sentences are equal outright, equal after collapsing whitespace, or they
 * differ by at most two words in length and their distinct-word sets overlap
 * with a score (2 * shared / (sizeA + sizeB)) of at least `threshold`.
 *
 * @param {string} s1 - First sentence.
 * @param {string} s2 - Second sentence.
 * @param {number} [threshold] - Minimum overlap score; defaults to
 *   SENTENCE_SIMILARITY_THRESHOLD.
 * @returns {boolean} True when the sentences count as similar.
 */
function isSimilarSentence(s1, s2, threshold = HALLUCINATION_THRESHOLDS.SENTENCE_SIMILARITY_THRESHOLD) {
  if (s1 === s2) {
    return true;
  }

  const collapse = (sentence) => sentence.replace(/\s+/g, " ").trim();
  const left = collapse(s1);
  const right = collapse(s2);
  if (left === right) {
    return true;
  }

  const leftWords = left.split(/\s+/);
  const rightWords = right.split(/\s+/);
  // Sentences of very different length are never considered similar.
  if (Math.abs(leftWords.length - rightWords.length) > 2) {
    return false;
  }

  const leftSet = new Set(leftWords);
  const rightSet = new Set(rightWords);
  let shared = 0;
  for (const word of leftSet) {
    if (rightSet.has(word)) {
      shared += 1;
    }
  }
  const score = (shared * 2) / (leftSet.size + rightSet.size);
  return score >= threshold;
}
|
|
2370
|
+
/**
 * Flags transcripts that consist of one leading chunk repeated back-to-back.
 *
 * The text is lower-cased and whitespace-collapsed. Candidate chunk sizes run
 * from MIN_CYCLE_LENGTH up to half the text length in steps of 5; for each
 * size the leading chunk is compared against every following chunk of the
 * same size (exactly, or via isSegmentSimilar). A trailing shorter piece
 * counts as one more repeat when it is a prefix of the chunk and at least one
 * full match was already seen.
 *
 * @param {string} text - Trimmed transcript text.
 * @returns {boolean} True when some chunk size reaches MIN_CYCLE_REPEATS.
 */
function detectCyclicPattern(text) {
  const haystack = text.toLowerCase().replace(/\s+/g, " ").trim();
  const total = haystack.length;
  const shortest = HALLUCINATION_THRESHOLDS.MIN_CYCLE_LENGTH;
  const longest = Math.floor(total / 2);
  if (longest < shortest) {
    return false;
  }

  // Counts how many consecutive chunks of `size`, starting at offset 0,
  // match the leading chunk.
  const countRepeats = (size) => {
    const template = haystack.substring(0, size);
    let repeats = 0;
    for (let offset = 0; offset < total; offset += size) {
      const chunk = haystack.substring(offset, offset + size);
      if (chunk.length < size) {
        // Trailing partial chunk: only extends an already-started cycle.
        const continuesCycle = repeats > 0 && template.startsWith(chunk);
        return continuesCycle ? repeats + 1 : repeats;
      }
      if (chunk !== template && !isSegmentSimilar(chunk, template)) {
        return repeats;
      }
      repeats += 1;
    }
    return repeats;
  };

  const stride = 5;
  for (let size = shortest; size <= longest; size += stride) {
    if (countRepeats(size) >= HALLUCINATION_THRESHOLDS.MIN_CYCLE_REPEATS) {
      return true;
    }
  }
  return false;
}
|
|
2403
|
+
/**
 * Compares two text segments position by position.
 *
 * Identical segments are similar; segments of different length never are.
 * Otherwise the fraction of positions holding the same character must reach
 * SEGMENT_SIMILARITY_THRESHOLD.
 *
 * @param {string} s1 - First segment.
 * @param {string} s2 - Second segment.
 * @returns {boolean} True when the segments count as similar.
 */
function isSegmentSimilar(s1, s2) {
  if (s1 === s2) {
    return true;
  }
  const size = s1.length;
  if (size !== s2.length) {
    return false;
  }

  let agreeing = 0;
  let index = size;
  while (index-- > 0) {
    if (s1[index] === s2[index]) {
      agreeing += 1;
    }
  }
  return agreeing / size >= HALLUCINATION_THRESHOLDS.SEGMENT_SIMILARITY_THRESHOLD;
}
|
|
2416
|
+
/**
 * Computes the Shannon entropy of a text, in bits per symbol.
 *
 * Symbols are the code points of the lower-cased text. Probabilities are taken
 * over the number of symbols actually counted rather than `text.length`:
 * `length` counts UTF-16 code units of the original string, which differs from
 * the symbol count for astral characters (e.g. emoji) and for case mappings
 * that change length, and would make the probabilities not sum to 1.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Entropy in bits per symbol; 0 for empty or missing text.
 */
function calculateEntropy(text) {
  if (!text || text.length === 0) {
    return 0;
  }
  const frequencies = /* @__PURE__ */ new Map();
  let symbolCount = 0;
  for (const char of text.toLowerCase()) {
    frequencies.set(char, (frequencies.get(char) || 0) + 1);
    symbolCount += 1;
  }
  let entropy = 0;
  for (const count of frequencies.values()) {
    const probability = count / symbolCount;
    entropy -= probability * Math.log2(probability);
  }
  return entropy;
}
|
|
2432
|
+
/**
 * Replaces a hallucinated transcript with an empty string.
 *
 * The transcript is passed to detectTranscriptHallucinationWithDetails. When
 * it is flagged, a warning with the first 100 characters, the reason and the
 * confidence is logged and "" is returned; otherwise the transcript is
 * returned unchanged.
 *
 * @param {string} transcript - Raw transcript text.
 * @returns {string} The original transcript, or "" when it was flagged.
 */
function cleanHallucinatedTranscript(transcript) {
  const verdict = detectTranscriptHallucinationWithDetails(transcript);
  if (!verdict.isHallucination) {
    return transcript;
  }

  const reason = verdict.reason != null ? verdict.reason : "Unknown";
  const confidence = verdict.confidence != null ? verdict.confidence : "Unknown";
  console.warn(
    "Hallucinated transcript detected and removed:",
    transcript.substring(0, 100),
    `\nReason: ${reason}`,
    `Confidence: ${String(confidence)}`
  );
  return "";
}
|
|
2447
|
+
|
|
2245
2448
|
// src/utils/ai/get-transcript.ts
|
|
2246
|
-
async function getTranscript(model, args) {
|
|
2449
|
+
async function getTranscript(model, args, cleanHallucinations = true) {
|
|
2247
2450
|
var _a, _b, _c, _d, _e, _f;
|
|
2248
2451
|
const getGeminiTranscript = (_b = (_a = api).httpsCallable) == null ? void 0 : _b.call(_a, "getGeminiTranscript");
|
|
2249
2452
|
const getAssemblyAITranscript = (_d = (_c = api).httpsCallable) == null ? void 0 : _d.call(_c, "transcribeAssemblyAIAudio");
|
|
@@ -2254,7 +2457,7 @@ async function getTranscript(model, args) {
|
|
|
2254
2457
|
audioUrl: args.audioUrl,
|
|
2255
2458
|
language: args.language
|
|
2256
2459
|
}));
|
|
2257
|
-
return data;
|
|
2460
|
+
return cleanHallucinations ? cleanHallucinatedTranscript(data) : data;
|
|
2258
2461
|
} catch (error) {
|
|
2259
2462
|
console.error("Error getting transcript from Whisper:", error);
|
|
2260
2463
|
throw error;
|
|
@@ -2267,7 +2470,7 @@ async function getTranscript(model, args) {
|
|
|
2267
2470
|
targetLanguage: args.language,
|
|
2268
2471
|
prompt: args.prompt
|
|
2269
2472
|
}));
|
|
2270
|
-
return data.transcript;
|
|
2473
|
+
return cleanHallucinations ? cleanHallucinatedTranscript(data.transcript) : data.transcript;
|
|
2271
2474
|
} catch (error) {
|
|
2272
2475
|
console.error("Error getting transcript from Gemini:", error);
|
|
2273
2476
|
throw error;
|
|
@@ -2279,7 +2482,7 @@ async function getTranscript(model, args) {
|
|
|
2279
2482
|
audioUrl: args.audioUrl,
|
|
2280
2483
|
language: args.language
|
|
2281
2484
|
}));
|
|
2282
|
-
return response.data;
|
|
2485
|
+
return cleanHallucinations ? cleanHallucinatedTranscript(response.data) : response.data;
|
|
2283
2486
|
} catch (error) {
|
|
2284
2487
|
console.error("Error getting transcript from AssemblyAI:", error);
|
|
2285
2488
|
throw error;
|
|
@@ -2287,6 +2490,37 @@ async function getTranscript(model, args) {
|
|
|
2287
2490
|
}
|
|
2288
2491
|
return null;
|
|
2289
2492
|
}
|
|
2493
|
+
/**
 * Tries each transcription model in turn until one yields a usable transcript.
 *
 * Models are tried in the order whisper, gemini, assemblyai. getTranscript is
 * called with hallucination cleaning switched off and the result is cleaned
 * here instead. A model that throws, or whose cleaned transcript is empty or
 * whitespace-only, is skipped and the next model is tried.
 *
 * @param {object} args - Arguments forwarded unchanged to getTranscript.
 * @returns {Promise<{transcript: string, success: boolean}>} The first usable
 *   transcript with `success: true`, or `{ transcript: "", success: false }`
 *   when every model failed or returned nothing usable. Never rejects because
 *   of a model error.
 */
async function getTranscriptCycle(args) {
  const models = ["whisper", "gemini", "assemblyai"];
  let lastError = null;
  for (const model of models) {
    try {
      const transcriptResult = await getTranscript(model, args, false);
      const transcript = cleanHallucinatedTranscript(transcriptResult || "");
      // A whitespace-only transcript is as useless as an empty one; it used to
      // be reported as a success and stopped the fallback chain.
      if (typeof transcript === "string" && transcript.trim() !== "") {
        console.log(`Successfully got transcript from ${model}`);
        return {
          transcript,
          success: true
        };
      }
      console.warn(`${model} returned empty transcript, trying next model`);
    } catch (e) {
      // Keep going: a failure of one provider must not abort the cycle.
      console.error(`Error with ${model} transcript:`, e);
      lastError = e;
    }
  }
  console.error("All transcript models failed or returned empty", lastError);
  return {
    transcript: "",
    success: false
  };
}
|
|
2290
2524
|
|
|
2291
2525
|
// src/constants/all-langs.json
|
|
2292
2526
|
var all_langs_default = {
|
|
@@ -3033,6 +3267,17 @@ function useSpeakableTranscript() {
|
|
|
3033
3267
|
mutation
|
|
3034
3268
|
};
|
|
3035
3269
|
}
|
|
3270
|
+
/**
 * Hook exposing getTranscriptCycle as a mutation.
 *
 * The mutation is created with `useMutation` from `import_react_query7`,
 * forwards its arguments to getTranscriptCycle, and has retries disabled.
 *
 * @returns {{mutationTranscriptCycle: object}} The object returned by
 *   `useMutation`, under the key `mutationTranscriptCycle`.
 */
function useSpeakableTranscriptCycle() {
  const runTranscriptCycle = async (args) => getTranscriptCycle(args);
  const mutationTranscriptCycle = (0, import_react_query7.useMutation)({
    mutationFn: runTranscriptCycle,
    retry: false
  });
  return { mutationTranscriptCycle };
}
|
|
3036
3281
|
|
|
3037
3282
|
// src/hooks/useUpdateStudentVoc.ts
|
|
3038
3283
|
var useUpdateStudentVocab = (page) => {
|