wobble-bibble 1.0.4 → 1.2.0
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- package/README.md +15 -32
- package/dist/index.d.ts +169 -141
- package/dist/index.js +512 -226
- package/dist/index.js.map +1 -1
- package/package.json +41 -41
package/README.md
CHANGED
|
@@ -51,24 +51,18 @@ const ids = getPromptIds(); // ['master_prompt', 'hadith', 'fiqh', ...]
|
|
|
51
51
|
### Validate LLM Output
|
|
52
52
|
|
|
53
53
|
```typescript
|
|
54
|
-
import {
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
54
|
+
import { validateTranslationResponse } from 'wobble-bibble';
|
|
55
|
+
|
|
56
|
+
const segments = [
|
|
57
|
+
{ id: 'P1234', text: '... Arabic source for P1234 ...' },
|
|
58
|
+
{ id: 'P1235', text: '... Arabic source for P1235 ...' },
|
|
59
|
+
];
|
|
59
60
|
|
|
60
61
|
const llmOutput = `P1234 - Translation of first segment
|
|
61
62
|
P1235 - Translation of second segment`;
|
|
62
63
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
if (!result.isValid) {
|
|
66
|
-
console.error('Error:', result.error);
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
// Individual detectors
|
|
70
|
-
const arabicWarnings = detectArabicScript(llmOutput); // Soft warnings
|
|
71
|
-
const newlineError = detectNewlineAfterId(llmOutput); // Hard error
|
|
64
|
+
const result = validateTranslationResponse(segments, llmOutput);
|
|
65
|
+
if (result.errors.length > 0) console.error(result.errors);
|
|
72
66
|
```
|
|
73
67
|
|
|
74
68
|
## API Reference
|
|
@@ -84,32 +78,20 @@ const newlineError = detectNewlineAfterId(llmOutput); // Hard error
|
|
|
84
78
|
| `getPromptIds()` | Get list of available prompt IDs |
|
|
85
79
|
| `stackPrompts(master, addon)` | Manually combine prompts |
|
|
86
80
|
|
|
87
|
-
### Validation
|
|
88
|
-
|
|
89
|
-
| Function | Description |
|
|
90
|
-
|----------|-------------|
|
|
91
|
-
| `validateTranslations(text, expectedIds)` | Full validation pipeline |
|
|
92
|
-
| `validateTranslationMarkers(text)` | Check for malformed IDs (e.g., `P123$4`) |
|
|
93
|
-
| `detectNewlineAfterId(text)` | Catch `P1234\nText` (Gemini bug) |
|
|
94
|
-
| `detectImplicitContinuation(text)` | Catch "implicit continuation" text |
|
|
95
|
-
| `detectMetaTalk(text)` | Catch "(Note:", "[Editor:" |
|
|
96
|
-
| `detectDuplicateIds(ids)` | Catch same ID appearing twice |
|
|
97
|
-
|
|
98
|
-
### Validation (Soft Warnings)
|
|
81
|
+
### Validation
|
|
99
82
|
|
|
100
83
|
| Function | Description |
|
|
101
84
|
|----------|-------------|
|
|
102
|
-
| `
|
|
103
|
-
| `
|
|
85
|
+
| `validateTranslationResponse(segments, response)` | Unified validator for LLM translation responses (IDs, Arabic leak, invented IDs, gaps, speaker-label drift, etc.) |
|
|
86
|
+
| `VALIDATION_ERROR_TYPE_INFO` | Human-readable descriptions for each `ValidationErrorType` (for UI/logging) |
|
|
104
87
|
|
|
105
88
|
### Utilities
|
|
106
89
|
|
|
107
90
|
| Function | Description |
|
|
108
91
|
|----------|-------------|
|
|
109
|
-
| `extractTranslationIds(text)` | Extract all segment IDs from text |
|
|
110
|
-
| `normalizeTranslationText(text)` | Split merged markers onto separate lines |
|
|
111
|
-
| `findUnmatchedTranslationIds(ids, expected)` | Find IDs not in expected list |
|
|
112
92
|
| `formatExcerptsForPrompt(segments, prompt)` | Format segments for LLM input |
|
|
93
|
+
| `normalizeTranslationText(text)` | Normalize newlines and split merged markers onto separate lines |
|
|
94
|
+
| `extractTranslationIds(text)` | Extract all segment IDs from "ID - ..." markers |
|
|
113
95
|
|
|
114
96
|
## Available Prompts
|
|
115
97
|
|
|
@@ -126,7 +108,8 @@ const newlineError = detectNewlineAfterId(llmOutput); // Hard error
|
|
|
126
108
|
|
|
127
109
|
## Prompt Development
|
|
128
110
|
|
|
129
|
-
See
|
|
111
|
+
See `docs/refinement-guide.md` for the methodology used to develop and test these prompts.
|
|
112
|
+
See `AI_REVIEW_PROMPT.md` for the peer-review prompt template used when sending round packets to external agents.
|
|
130
113
|
|
|
131
114
|
## License
|
|
132
115
|
|
package/dist/index.d.ts
CHANGED
|
@@ -31,38 +31,20 @@ declare const TRANSLATION_MARKER_PARTS: {
|
|
|
31
31
|
*/
|
|
32
32
|
declare const MARKER_ID_PATTERN: string;
|
|
33
33
|
//#endregion
|
|
34
|
-
//#region src/formatting.d.ts
|
|
35
|
-
/**
|
|
36
|
-
* Internal segment type for formatting.
|
|
37
|
-
*/
|
|
38
|
-
type Segment = {
|
|
39
|
-
/** The segment ID (e.g., P1) */id: string; /** The segment text */
|
|
40
|
-
text: string;
|
|
41
|
-
};
|
|
42
|
-
/**
|
|
43
|
-
* Formats excerpts for an LLM prompt by combining the prompt rules with the segment text.
|
|
44
|
-
* Each segment is formatted as "ID - Text" and separated by double newlines.
|
|
45
|
-
*
|
|
46
|
-
* @param segments - Array of segments to format
|
|
47
|
-
* @param prompt - The instruction/system prompt to prepend
|
|
48
|
-
* @returns Combined prompt and formatted text
|
|
49
|
-
*/
|
|
50
|
-
declare const formatExcerptsForPrompt: (segments: Segment[], prompt: string) => string;
|
|
51
|
-
//#endregion
|
|
52
34
|
//#region .generated/prompts.d.ts
|
|
53
35
|
type PromptId = 'master_prompt' | 'encyclopedia_mixed' | 'fatawa' | 'fiqh' | 'hadith' | 'jarh_wa_tadil' | 'tafsir' | 'usul_al_fiqh';
|
|
54
36
|
declare const PROMPTS: readonly [{
|
|
55
37
|
readonly id: "master_prompt";
|
|
56
38
|
readonly name: "Master Prompt";
|
|
57
|
-
readonly content: "ROLE: Expert academic translator of Classical Islamic texts; prioritize accuracy and structure over fluency.\nCRITICAL NEGATIONS: 1. NO SANITIZATION (Do not soften polemics). 2. NO META-TALK (Output translation only). 3. NO MARKDOWN (Plain text only). 4. NO EMENDATION. 5. NO INFERENCE. 6. NO RESTRUCTURING. 7. NO OPAQUE TRANSLITERATION (Must translate phrases). 8. NO INVENTED SEGMENTS (Do not create, modify, or \"continue\" segment IDs. Output IDs verbatim exactly as they appear in the source input/metadata. Alphabetic suffixes (e.g., P5511a) are allowed IF AND ONLY IF that exact ID appears in the source. Any ID not present verbatim in the source is INVENTED. EXAMPLE: If P5803b ends with a questioner line, that line stays under P5803b — do NOT invent P5803c. If an expected ID is missing from the source, output: \"ID - [MISSING]\".)\nRULES: NO ARABIC SCRIPT (Except ﷺ). Plain text only. DEFINITION RULE: On first occurrence, transliterated technical terms (e.g., bidʿah) MUST be defined: \"translit (English)\". Preserve Segment ID. Translate meaning/intent. No inference. No extra fields. Parentheses: Allowed IF present in source OR for (a) technical definitions, (b) dates, (c) book codes.\nARABIC LEAK (Hard ban):\n- SCRIPT LOCK: Output must be 100% Latin script (ASCII + ALA-LC diacritics like ā ī ū ḥ ṣ ḍ ṭ ẓ ʿ ʾ). These diacritics are allowed/required and are NOT Arabic script.\n- STRICT BAN: Arabic script codepoints (letters, Arabic-Indic numerals ٠-٩, punctuation like ، ؟ ؛ « » , tatweel ـ, and Arabic presentation forms) are forbidden everywhere in output (even inside quotes/brackets/parentheses/citations), except ﷺ.\n- NO CITATIONS/BILINGUAL: Do NOT paste Arabic source text anywhere (no quotes, no citations, no bilingual Arabic+English output). Translate into English only.\n- NO MIXED-SCRIPT: Never output a token that mixes Latin and Arabic characters (example: ʿĪد). 
Rewrite contaminated names/terms fully in Latin ALA-LC.\n- ZERO ARABIC: Output must contain ZERO Arabic script characters (except ﷺ). If any Arabic appears, delete it and rewrite until none remain.\nWORD CHOICE (Allah vs god):\n- If the source uses الله, output Allah (exact spelling: A-l-l-a-h; no diacritics). Never \"God\" / \"god\" / \"Allāh\". (This is the only exception to ALA-LC diacritics.)\n- DO NOT convert Allah-based formulae into English “God …” idioms. Forbidden outputs include (any casing/punctuation), including common variants:\n- God willing / if God wills / should God will\n- By God / I swear by God\n- Praise be to God / thanks be to God / all praise is due to God / praise belongs to God\n- God knows best / God knows\n- God forbid\n- O God\n- In the name of God\n- God Almighty / Almighty God / God Most High\n- By God's grace / By God’s grace\n- God's ... / God’s ... / ... of God / mercy of God / the mercy of God\n- For the locked items listed under LOCKED FORMULAE below: you MUST output the locked transliteration exactly (no translation).\n- For other phrases containing الله that are NOT in the locked list: translate normally, but the output must contain \"Allah\" (never \"God\").\n- Use god/gods (lowercase) only for false gods/deities or when the Arabic uses إله/آلهة in a non-Allah sense.\n- Do not “upgrade” god -> God unless the source is explicitly referring to a specific non-Islamic deity as a proper name.\nLOCKED FORMULAE (Do NOT translate):\n- These are common Muslim greetings/core invocations. Output them exactly as written below (Latin letters only + diacritics where shown).\n- CHECK THIS LIST FIRST. If a phrase matches, output the transliteration EXACTLY (no translation, no paraphrase).\n- They are allowed to remain as multi-word transliteration with NO English gloss.\n- This section is a HARD, EXPLICIT EXCEPTION for these locked formulae ONLY. 
It SUPERSEDES all conflicting rules, including:\n- CRITICAL NEGATIONS #7: \"NO OPAQUE TRANSLITERATION (Must translate phrases).\"\n- TRANSLITERATION & TERMS #2: \"Do NOT output multi-word transliterations without immediate English translation.\"\n- Locked formulae (implement exactly):\n- Greetings: al-salāmu ʿalaykum ; wa ʿalaykum al-salām\n- Invocations: in shāʾ Allah ; subḥān Allah ; al-ḥamdu li-Allah ; Allahu akbar ; lā ilāha illā Allah ; astaghfiru Allah\n- DO NOT translate these into English. Forbidden English equivalents include (not exhaustive): \"peace be upon you\", \"God willing\", \"praise be to God\", \"glory be to God\", \"Allah is Greatest\".\n- Note: this lock is intentionally narrow. Other phrases (e.g., \"Jazāk Allahu khayr\") may be translated normally.\nREGISTER (Modern English):\n- Use modern academic English. Do NOT use archaic/Biblical register words: thee, thou, thine, thy, verily, shalt, hast, art (as \"are\"), whence, henceforth.\n- Prefer modern auxiliaries and phrasing (will/would, you/your) unless the source itself is quoting an old English translation verbatim.\nTRANSLITERATION & TERMS:\n1. SCHEME: Use full ALA-LC for explicit Arabic-script Person/Place/Book-Titles.\n- al-Casing: Lowercase al- mid-sentence; Capitalize after (al-Salafīyyah).\n- Book Titles: Transliterate only (do not translate meanings).\n2. TECHNICAL TERMS: On first occurrence, define: \"translit (English)\" (e.g., bidʿah (innovation), isnād (chain)).\n- Do NOT output multi-word transliterations without immediate English translation.\n3. STANDARDIZED TERMS: Use standard academic spellings: Muḥammad, Shaykh, Qurʾān, Islām, ḥadīth.\n- Sunnah (Capitalized) = The Corpus/Prophetic Tradition. sunnah (lowercase) = legal status/recommended.\n4. PROPER NAMES: Transliterate only (no parentheses).\n5. UNICODE: Latin + Latin Extended (āīūḥʿḍṣṭẓʾ) + punctuation. NO Arabic script (except ﷺ). 
NO emoji.\n- DIACRITIC FALLBACK: If you cannot produce correct ALA-LC diacritics, output English only. Do NOT use substitute accents (â/ã/á).\n6. SALUTATION: Replace all Prophet salutations with ﷺ.\n7. AMBIGUITY: Use contextual meaning from tafsir for theological terms. Do not sanitise polemics (e.g. Rāfiḍah).\nOUTPUT FORMAT: Segment_ID - English translation.\nCRITICAL: You must use the ASCII hyphen separator \" - \" (space+hyphen+space) immediately after the ID. Do NOT use em-dash or en-dash. Do NOT use a newline after the ID.\nID INTEGRITY (Check First):\n- PREPASS (Silent closed set): Internally identify the exact ordered list of Segment_IDs present in the source. Treat this list as a CLOSED SET. Do not output this list.\n- REQUIRED (Exact match): Your output must contain EXACTLY those Segment_IDs, in the EXACT same order, each appearing EXACTLY ONCE as an \"ID - ...\" prefix. FORBIDDEN: re-outputting an ID prefix you already used (even in long segments).\n- BAN (No new IDs): Do NOT invent ANY IDs or ID-like labels not present verbatim in the source (including \"(continued)\", \"cont.\", \"part 2\", or invented suffixes like P123c). Suffix IDs are allowed ONLY if that exact ID appears in the source.\n- BOUNDARY (No bleed): Translate ONLY the text that belongs to the current Segment_ID (from its header to the next Segment_ID header, or to end-of-input for the last segment). Do NOT move lines across IDs and do NOT merge segments.\n- INCOMPLETE (Strict): Use \"ID - [INCOMPLETE]\" ONLY if the provided source text under that ID is truly unreadable/untranslatable. NEVER use \"[INCOMPLETE]\" for ellipses (…) or long segments. Translate all available text.\nMULTI-LINE SEGMENTS (e.g., internal Q&A): Output the Segment_ID and \" - \" ONLY ONCE on the first line. 
Do NOT repeat the Segment_ID on subsequent lines; subsequent lines must start directly with the speaker label/text (no \"ID - \" prefix).\nSEGMENT BOUNDARIES (Anti-hallucination): Start a NEW segment ONLY when the source explicitly provides a Segment_ID. If the source continues with extra lines (including speaker labels like \"Questioner:\"/\"The Shaykh:\"/\"السائل:\"/\"الشيخ:\") WITHOUT a new Segment_ID, treat them as part of the CURRENT segment (multi-line under the current Segment_ID). Do NOT invent a new ID (including alphabetic suffixes like \"P5803c\") to label such continuation.\nOUTPUT COMPLETENESS: Translate ALL content in EVERY segment. Do not truncate, summarize, or skip content. The \"…\" symbol in the source indicates an audio gap in the original recording — it is NOT an instruction to omit content. Every segment must be fully translated. If you cannot complete a segment, output \"ID - [INCOMPLETE]\" instead of just \"…\".\nOUTPUT UNIQUENESS: Each Segment_ID from the source must appear in your output EXACTLY ONCE as an \"ID - ...\" prefix. Do NOT output the same Segment_ID header twice. If a segment is long or has multiple speaker turns, continue translating under that single ID header without re-stating it.\nNEGATIVE CONSTRAINTS: Do NOT output \"implicit continuation\", summaries, or extra paragraphs. Output only the text present in the source segment.\nExample: P1234 - Translation text... (Correct) vs P1234\\nTranslation... (Forbidden).\nEXAMPLE: Input: P405 - حدثنا عبد الله بن يوسف... Output: P405 - ʿAbd Allāh b. Yūsuf narrated to us...";
|
|
39
|
+
readonly content: "ROLE: Expert academic translator of Classical Islamic texts; prioritize accuracy and structure over fluency.\nCRITICAL NEGATIONS: 1. NO SANITIZATION (Do not soften polemics). 2. NO META-TALK (Output translation only). 3. NO MARKDOWN (Plain text only). 4. NO EMENDATION. 5. NO INFERENCE. 6. NO RESTRUCTURING. 7. NO OPAQUE TRANSLITERATION (Must translate phrases). 8. NO INVENTED SEGMENTS (Do not create, modify, or \"continue\" segment IDs. Output IDs verbatim exactly as they appear in the source input/metadata. Alphabetic suffixes (e.g., P5511a) are allowed IF AND ONLY IF that exact ID appears in the source. Any ID not present verbatim in the source is INVENTED. EXAMPLE: If P5803b ends with a questioner line, that line stays under P5803b — do NOT invent P5803c. If an expected ID is missing from the source, output: \"ID - [MISSING]\".)\nRULES: NO ARABIC SCRIPT (Except ﷺ). Plain text only. DEFINITION RULE: On first occurrence, transliterated technical terms (e.g., bidʿah) MUST be defined: \"translit (English)\". Preserve Segment ID. Translate meaning/intent. No inference. No extra fields. Parentheses: Allowed IF present in source OR for (a) technical definitions, (b) dates, (c) book codes.\nARABIC LEAK (Hard ban):\n- SCRIPT LOCK: Output must be 100% Latin script (ASCII + ALA-LC diacritics like ā ī ū ḥ ṣ ḍ ṭ ẓ ʿ ʾ). These diacritics are allowed/required and are NOT Arabic script.\n- STRICT BAN: Arabic script codepoints (letters, Arabic-Indic numerals ٠-٩, punctuation like ، ؟ ؛ « » , tatweel ـ, and Arabic presentation forms) are forbidden everywhere in output (even inside quotes/brackets/parentheses/citations), except ﷺ.\n- NO CITATIONS/BILINGUAL: Do NOT paste Arabic source text anywhere (no quotes, no citations, no bilingual Arabic+English output). Translate into English only.\n- QUOTES/VERSES: Even if the source includes Arabic Qurʾān/ḥadīth/quoted Arabic text (e.g., «...») you must NOT copy Arabic. 
Translate the meaning into English only.\n- NO MIXED-SCRIPT: Never output a token that mixes Latin and Arabic characters (example: ʿĪد). Rewrite contaminated names/terms fully in Latin ALA-LC.\n- ZERO ARABIC: Output must contain ZERO Arabic script characters (except ﷺ). If any Arabic appears, delete it and rewrite until none remain.\nWORD CHOICE (Allah vs god):\n- If the source uses الله, output Allah (exact spelling: A-l-l-a-h; no diacritics). Never \"God\" / \"god\" / \"Allāh\". (This is the only exception to ALA-LC diacritics.)\n- DO NOT convert Allah-based formulae into English “God …” idioms. Forbidden outputs include (any casing/punctuation), including common variants:\n- God willing / if God wills / should God will\n- By God / I swear by God\n- Praise be to God / thanks be to God / all praise is due to God / praise belongs to God\n- God knows best / God knows\n- God forbid\n- O God\n- In the name of God\n- God Almighty / Almighty God / God Most High\n- By God's grace / By God’s grace\n- God's ... / God’s ... / ... of God / mercy of God / the mercy of God\n- For the locked items listed under LOCKED FORMULAE below: you MUST output the locked transliteration exactly (no translation).\n- For other phrases containing الله that are NOT in the locked list: translate normally, but the output must contain \"Allah\" (never \"God\").\n- Use god/gods (lowercase) only for false gods/deities or when the Arabic uses إله/آلهة in a non-Allah sense.\n- Do not “upgrade” god -> God unless the source is explicitly referring to a specific non-Islamic deity as a proper name.\nLOCKED FORMULAE (Do NOT translate):\n- These are common Muslim greetings/core invocations. Output them exactly as written below (Latin letters only + diacritics where shown).\n- CHECK THIS LIST FIRST. 
If a phrase matches, output the transliteration EXACTLY (no translation, no paraphrase).\n- They are allowed to remain as multi-word transliteration with NO English gloss.\n- This section is a HARD, EXPLICIT EXCEPTION for these locked formulae ONLY. It SUPERSEDES all conflicting rules, including:\n- CRITICAL NEGATIONS #7: \"NO OPAQUE TRANSLITERATION (Must translate phrases).\"\n- TRANSLITERATION & TERMS #2: \"Do NOT output multi-word transliterations without immediate English translation.\"\n- TRANSLITERATION & TERMS: \"Do NOT transliterate full sentences/matn/quotes.\"\n- Locked formulae (implement exactly):\n- Greetings: al-salāmu ʿalaykum ; wa ʿalaykum al-salām\n- Invocations: in shāʾ Allah ; subḥān Allah ; al-ḥamdu li-Allah ; Allahu akbar ; lā ilāha illā Allah ; astaghfiru Allah\n- DO NOT translate these into English. Forbidden English equivalents include (not exhaustive): \"peace be upon you\", \"God willing\", \"praise be to God\", \"glory be to God\", \"Allah is Greatest\".\n- Note: this lock is intentionally narrow. Other phrases (e.g., \"Jazāk Allahu khayr\") may be translated normally.\nREGISTER (Modern English):\n- Use modern academic English. Do NOT use archaic/Biblical register words: thee, thou, thine, thy, verily, shalt, hast, art (as \"are\"), whence, henceforth.\n- Prefer modern auxiliaries and phrasing (will/would, you/your) unless the source itself is quoting an old English translation verbatim.\n- NO ALL CAPS / NO KJV-STYLE: Do NOT use ALL CAPS for emphasis (even inside quotes). Do NOT render Arabic Qurʾān/ḥadīth in KJV/Biblical style.\nTRANSLITERATION & TERMS:\n1. SCHEME: Use full ALA-LC for explicit Arabic-script Person/Place/Book-Titles.\n- al-Casing: Lowercase al- mid-sentence; Capitalize after (al-Salafīyyah).\n- Book Titles: Transliterate only (do not translate meanings).\n2. 
TECHNICAL TERMS: On first occurrence, define: \"translit (English)\" (e.g., bidʿah (innovation), isnād (chain)).\n- Do NOT output multi-word transliterations without immediate English translation.\n- Do NOT transliterate full sentences/matn/quotes. Translate into English; transliteration is for names/terms only.\n- EXCEPTION (Duʿāʾ/Supplications): If the source contains a specific duʿāʾ/supplication phrase and you choose to preserve its wording for pronunciation, you MAY output transliteration BUT you MUST also translate it immediately (same line or next) as: \"translit (English translation)\". Do NOT output Arabic script.\n- Example Allowed: Allāhumma innī asʾaluka al-ʿāfiyah (O Allah, I ask You for well-being).\n- Example Forbidden: Transliterate a long multi-sentence duʿāʾ paragraph without translating it.\n- LOCKED FORMULAE are the only exception allowed to remain multi-word transliteration with NO English gloss.\n- If you use any other multi-word transliteration (not locked), it MUST be immediately glossed: \"translit (English)\". Prefer full English translation for phrases.\n- Do NOT leave common nouns/objects/roles as transliteration (e.g., tools, foods, occupations). Translate them into English. If you must transliterate a non-name, you MUST immediately gloss it: \"translit (English)\".\n3. STANDARDIZED TERMS: Use standard academic spellings: Muḥammad, Shaykh, Qurʾān, Islām, ḥadīth.\n- Sunnah (Capitalized) = The Corpus/Prophetic Tradition. sunnah (lowercase) = legal status/recommended.\n4. PROPER NAMES: Transliterate only (no parentheses).\n5. UNICODE: Latin + Latin Extended (āīūḥʿḍṣṭẓʾ) + punctuation. NO Arabic script (except ﷺ). NO emoji.\n- DIACRITIC FALLBACK: If you cannot produce correct ALA-LC diacritics, output English only. Do NOT use substitute accents (â/ã/á).\n6. SALUTATION: Replace all Prophet salutations with ﷺ.\n7. 
HONORIFICS: Expand common phrases (do not transliterate):\n- Allah ʿazza wa-jall -> Allah, the Mighty and Majestic\n- rahimahu Allah -> may Allah have mercy on him\n8. AMBIGUITY: Use contextual meaning from tafsir for theological terms. Do not sanitise polemics (e.g. Rāfiḍah).\nOUTPUT FORMAT: Segment_ID - English translation.\nCRITICAL: You must use the ASCII hyphen separator \" - \" (space+hyphen+space) immediately after the ID. Do NOT use em-dash or en-dash. Do NOT use a newline after the ID.\nID INTEGRITY (Check First):\n- PREPASS (Silent closed set): Internally identify the exact ordered list of Segment_IDs present in the source. Treat this list as a CLOSED SET. Do not output this list.\n- REQUIRED (Exact match): Your output must contain EXACTLY those Segment_IDs, in the EXACT same order, each appearing EXACTLY ONCE as an \"ID - ...\" prefix. FORBIDDEN: re-outputting an ID prefix you already used (even in long segments).\n- BAN (No new IDs): Do NOT invent ANY IDs or ID-like labels not present verbatim in the source (including \"(continued)\", \"cont.\", \"part 2\", or invented suffixes like P123c). Suffix IDs are allowed ONLY if that exact ID appears in the source.\n- BOUNDARY (No bleed): Translate ONLY the text that belongs to the current Segment_ID (from its header to the next Segment_ID header, or to end-of-input for the last segment). Do NOT move lines across IDs and do NOT merge segments.\n- INCOMPLETE (Strict): Use \"ID - [INCOMPLETE]\" ONLY if the provided source text under that ID is truly unreadable/untranslatable. NEVER use \"[INCOMPLETE]\" for ellipses (…) or long segments. Translate all available text.\nMULTI-LINE SEGMENTS (e.g., internal Q&A): Output the Segment_ID and \" - \" ONLY ONCE on the first line. 
Do NOT repeat the Segment_ID on subsequent lines; subsequent lines must start directly with the speaker label/text (no \"ID - \" prefix).\nSEGMENT BOUNDARIES (Anti-hallucination): Start a NEW segment ONLY when the source explicitly provides a Segment_ID. If the source continues with extra lines (including speaker labels like \"Questioner:\"/\"The Shaykh:\"/\"السائل:\"/\"الشيخ:\") WITHOUT a new Segment_ID, treat them as part of the CURRENT segment (multi-line under the current Segment_ID). Do NOT invent a new ID (including alphabetic suffixes like \"P5803c\") to label such continuation.\nOUTPUT COMPLETENESS: Translate ALL content in EVERY segment. Do not truncate, summarize, or skip content. The \"…\" symbol in the source indicates an audio gap in the original recording — it is NOT an instruction to omit content. Every segment must be fully translated. If you cannot complete a segment, output \"ID - [INCOMPLETE]\" instead of just \"…\".\nOUTPUT UNIQUENESS: Each Segment_ID from the source must appear in your output EXACTLY ONCE as an \"ID - ...\" prefix. Do NOT output the same Segment_ID header twice. If a segment is long or has multiple speaker turns, continue translating under that single ID header without re-stating it.\nNEGATIVE CONSTRAINTS: Do NOT output \"implicit continuation\", summaries, or extra paragraphs. Output only the text present in the source segment.\nExample: P1234 - Translation text... (Correct) vs P1234\\nTranslation... (Forbidden).\nEXAMPLE: Input: P405 - حدثنا عبد الله بن يوسف... Output: P405 - ʿAbd Allāh b. Yūsuf narrated to us...";
|
|
58
40
|
}, {
|
|
59
41
|
readonly id: "encyclopedia_mixed";
|
|
60
42
|
readonly name: "Encyclopedia Mixed";
|
|
61
|
-
readonly content: "NO MODE TAGS: Do not output any mode labels or bracket tags.\nSTRUCTURE (Apply First):\n- Q&A: Whenever \"Al-Sāʾil:\"/\"Al-Shaykh:\" appear: Start NEW LINE for speaker. Keep Label+Text on SAME LINE.\n- EXCEPTION: If the speaker label is the VERY FIRST token after the \"ID - \" prefix, keep it on the same line. (Correct: P5455 - Questioner: Text...) (Wrong: P5455 \\n Questioner: Text...).\n- INTERNAL Q&A: If segment has multiple turns, use new lines for speakers. Output Segment ID ONLY ONCE at the start of the first line. Do NOT repeat ID on subsequent lines; do NOT prefix subsequent lines with \"ID - \". (e.g. P5455 - Questioner: ... \\n The Shaykh: ...).\n- OUTPUT LABELS: Al-Sāʾil -> Questioner: ; Al-Shaykh -> The Shaykh:\n\nDEFINITIONS & CASING:\n- GEOPOLITICS: Modern place names may use English exonyms (Filasṭīn -> Palestine).\n- PLURALS: Do not pluralize term-pairs by appending \"s\" (e.g., \"ḥadīth (report)s\"). Use the English plural or rephrase.\
|
|
43
|
+
readonly content: "NO MODE TAGS: Do not output any mode labels or bracket tags.\nSTRUCTURE (Apply First):\n- Q&A: Whenever \"Al-Sāʾil:\"/\"Al-Shaykh:\" appear: Start NEW LINE for speaker. Keep Label+Text on SAME LINE.\n- EXCEPTION: If the speaker label is the VERY FIRST token after the \"ID - \" prefix, keep it on the same line. (Correct: P5455 - Questioner: Text...) (Wrong: P5455 \\n Questioner: Text...).\n- INTERNAL Q&A: If segment has multiple turns, use new lines for speakers. Output Segment ID ONLY ONCE at the start of the first line. Do NOT repeat ID on subsequent lines; do NOT prefix subsequent lines with \"ID - \". (e.g. P5455 - Questioner: ... \\n The Shaykh: ...).\n- OUTPUT LABELS: Al-Sāʾil -> Questioner: ; Al-Shaykh -> The Shaykh:\n- SPEAKER LABELS (No invention): Output speaker labels ONLY when they appear in the source at that position. Do NOT add \"Questioner:\"/\"The Shaykh:\" to unlabeled text. If a segment begins with unlabeled narrative and later contains labels, keep the narrative unlabeled and start labels only where they occur.\nDEFINITIONS & CASING:\n- GEOPOLITICS: Modern place names may use English exonyms (Filasṭīn -> Palestine).\n- PLURALS: Do not pluralize term-pairs by appending \"s\" (e.g., \"ḥadīth (report)s\"). Use the English plural or rephrase.\nSTATE LOGIC (Priority: Isnad > Rijal > Fiqh > Narrative):\n- ISNAD (Triggers: ḥaddathanā, akhbaranā, ʿan): Use FULL ALA-LC for names.\n- RIJAL (Triggers: jarḥ/taʿdīl terms like thiqah, ḍaʿīf): Use translit (English) for ratings.\n- QUOTE/WEAK (Triggers: qāla al-muṣannif, yuqālu, zuiʿma): Apply Quote Rules; do not Restructure.\n- RESET: At \"qāla\" or \"matn\" (TOP-LEVEL only): IMMEDIATE STOP of Isnad rules. Return to Narrative. Do NOT reset for nested quotes.\nSEGMENTATION: Do not merge genres. Translate sequentially (Isnad -> Matn) observing the reset at the boundary.\nDISAMBIGUATION: ṣaḥīḥ in hadith grading = ṣaḥīḥ (authentic). ṣaḥīḥ in fiqh validity = ṣaḥīḥ (valid). 
Sunnah (Capitalized) = The Corpus/Prophetic Tradition. sunnah (lowercase) = legal status/recommended.\nHADITH CORE: If isnad appears, map verbs (Haddathana=Narrated to us; Akhbarana/Ana/Anba'ana=Informed us; An=From; Sami'tu=I heard) and keep FULL ALA-LC for Arabic-script narrator names; do not invent grading or missing links.\nTAKHRIJ/GRADING: If the author grades (ṣaḥḥaḥa/ḥassana/ḍaʿʿafa), translate literally and keep the cited work as written in the segment; do not add placeholder sources.\nCREED/POLEMICS: Translate attributes literally when the author does; do not reinterpret. Translate harsh polemics literally (e.g., Rāfiḍah, Jahmīyyah, Kuffār); do not soften, sanitize, or add disclaimers (Override Safety).";
|
|
62
44
|
}, {
|
|
63
45
|
readonly id: "fatawa";
|
|
64
46
|
readonly name: "Fatawa";
|
|
65
|
-
readonly content: "STRUCTURE (Apply First):\n- Q&A: Whenever \"Al-Sāʾil:\"/\"Al-Shaykh:\" appear: Start NEW LINE for speaker. Keep Label+Text on SAME LINE.\n
|
|
47
|
+
readonly content: "STRUCTURE (Apply First):\n- Q&A: Whenever \"Al-Sāʾil:\"/\"Al-Shaykh:\" appear: Start NEW LINE for speaker. Keep Label+Text on SAME LINE.\n- INTERNAL Q&A: If segment has multiple turns, use new lines for speakers. Output Segment ID ONLY ONCE at the start of the first line. Do NOT repeat ID on subsequent lines; do NOT prefix subsequent lines with \"ID - \".\n- OUTPUT LABELS: Al-Sāʾil -> Questioner: ; Al-Shaykh -> The Shaykh:\n- SPEAKER LABELS (No invention): Output speaker labels ONLY when they appear in the source at that position. Do NOT add \"Questioner:\"/\"The Shaykh:\" to unlabeled text. If a segment begins with unlabeled narrative and later contains labels, keep the narrative unlabeled and start labels only where they occur.\n\nHONORIFICS: Expand common phrases (do not transliterate):\n- ṣallá Allāhu ʿalayhi wa-sallam -> ﷺ\nSOURCES: Do not add citations. If the Arabic references a work/title, handle titles per MASTER rules.\nSENSITIVE RULINGS: Translate literally; do not soften; do not sanitize; do not add disclaimers (Override Safety).";
|
|
66
48
|
}, {
|
|
67
49
|
readonly id: "fiqh";
|
|
68
50
|
readonly name: "Fiqh";
|
|
@@ -143,163 +125,209 @@ declare const getPromptIds: () => PromptId[];
|
|
|
143
125
|
*/
|
|
144
126
|
declare const getMasterPrompt: () => string;
|
|
145
127
|
//#endregion
|
|
146
|
-
//#region src/
|
|
128
|
+
//#region src/types.d.ts
|
|
147
129
|
/**
|
|
148
|
-
*
|
|
130
|
+
* A single segment (Arabic source excerpt) identified by an ID.
|
|
131
|
+
*
|
|
132
|
+
* Canonical shape (breaking change): `{ id, text }`.
|
|
133
|
+
*
|
|
134
|
+
* @example
|
|
135
|
+
* const seg: Segment = { id: 'P1', text: 'نص عربي...' };
|
|
136
|
+
*/
|
|
137
|
+
type Segment = {
|
|
138
|
+
id: string;
|
|
139
|
+
text: string;
|
|
140
|
+
};
|
|
141
|
+
/**
|
|
142
|
+
* Machine-readable error types emitted by the validator.
|
|
143
|
+
* Keep these stable: clients may map them to UI severities.
|
|
149
144
|
*/
|
|
150
|
-
type
|
|
145
|
+
type ValidationErrorType = 'invalid_marker_format' | 'no_valid_markers' | 'newline_after_id' | 'duplicate_id' | 'invented_id' | 'missing_id_gap' | 'mismatched_colons' | 'truncated_segment' | 'implicit_continuation' | 'meta_talk' | 'arabic_leak' | 'wrong_diacritics' | 'empty_parentheses' | 'length_mismatch' | 'all_caps' | 'archaic_register' | 'multiword_translit_without_gloss';
|
|
151
146
|
/**
|
|
152
|
-
* A
|
|
147
|
+
* A single validation error.
|
|
153
148
|
*/
|
|
154
|
-
type
|
|
155
|
-
|
|
156
|
-
message: string;
|
|
149
|
+
type ValidationError = {
|
|
150
|
+
type: ValidationErrorType;
|
|
151
|
+
message: string;
|
|
152
|
+
id?: string;
|
|
157
153
|
match?: string;
|
|
158
154
|
};
|
|
159
155
|
/**
|
|
160
|
-
* Result of translation
|
|
156
|
+
* Result of validating an LLM translation response against a set of source segments.
|
|
161
157
|
*/
|
|
162
|
-
type
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
parsedIds: string[]; /** Soft warnings (issues that don't fail validation) */
|
|
167
|
-
warnings?: ValidationWarning[];
|
|
158
|
+
type ValidationResponseResult = {
|
|
159
|
+
normalizedResponse: string;
|
|
160
|
+
parsedIds: string[];
|
|
161
|
+
errors: ValidationError[];
|
|
168
162
|
};
|
|
163
|
+
//#endregion
|
|
164
|
+
//#region src/textUtils.d.ts
|
|
169
165
|
/**
|
|
170
|
-
*
|
|
171
|
-
*
|
|
166
|
+
* Formats excerpts for an LLM prompt by combining the prompt rules with the segment text.
|
|
167
|
+
* Each segment is formatted as "ID - Text" and separated by double newlines.
|
|
172
168
|
*
|
|
173
|
-
* @param
|
|
174
|
-
* @
|
|
169
|
+
* @param segments - Array of segments to format
|
|
170
|
+
* @param prompt - The instruction/system prompt to prepend
|
|
171
|
+
* @returns Combined prompt and formatted text
|
|
175
172
|
*/
|
|
176
|
-
declare const
|
|
173
|
+
declare const formatExcerptsForPrompt: (segments: Segment[], prompt: string) => string;
|
|
177
174
|
/**
|
|
178
|
-
*
|
|
179
|
-
* This is a SOFT warning - wrong diacritics are bad but not a hard failure.
|
|
175
|
+
* Normalize line endings and split merged markers onto separate lines.
|
|
180
176
|
*
|
|
181
|
-
* @
|
|
182
|
-
*
|
|
177
|
+
* @example
|
|
178
|
+
* // "helloP1 - ..." becomes split onto a new line before "P1 -"
|
|
179
|
+
* normalizeTranslationText('helloP1 - x').includes('\\nP1 -') === true
|
|
183
180
|
*/
|
|
184
|
-
declare const
|
|
181
|
+
declare const normalizeTranslationText: (content: string) => string;
|
|
185
182
|
/**
|
|
186
|
-
*
|
|
187
|
-
* Format should be "P1234 - Text" not "P1234\nText".
|
|
183
|
+
* Extract translation IDs from normalized response, in order.
|
|
188
184
|
*
|
|
189
|
-
* @
|
|
190
|
-
*
|
|
185
|
+
* @example
|
|
186
|
+
* extractTranslationIds('P1 - a\\nP2b - b') // => ['P1', 'P2b']
|
|
191
187
|
*/
|
|
192
|
-
declare const
|
|
188
|
+
declare const extractTranslationIds: (text: string) => string[];
|
|
193
189
|
/**
|
|
194
|
-
*
|
|
190
|
+
* Parse a single translation line in the form "ID - translation".
|
|
195
191
|
*
|
|
196
|
-
*
|
|
197
|
-
* @returns Error message if continuation text is found, otherwise undefined
|
|
198
|
-
*/
|
|
199
|
-
declare const detectImplicitContinuation: (text: string) => string | undefined;
|
|
200
|
-
/**
|
|
201
|
-
* Detects meta-talk (translator notes, editor comments) that violate NO META-TALK.
|
|
192
|
+
* Note: This returns a translation entry shape, not an Arabic source `Segment`.
|
|
202
193
|
*
|
|
203
|
-
* @param
|
|
204
|
-
* @returns
|
|
205
|
-
*/
|
|
206
|
-
declare const detectMetaTalk: (text: string) => string | undefined;
|
|
207
|
-
/**
|
|
208
|
-
* Detects duplicate segment IDs in the output.
|
|
194
|
+
* @param line - Single line to parse
|
|
195
|
+
* @returns `{ id, translation }` when valid; otherwise `null`
|
|
209
196
|
*
|
|
210
|
-
* @
|
|
211
|
-
*
|
|
212
|
-
*/
|
|
213
|
-
declare const detectDuplicateIds: (ids: string[]) => string | undefined;
|
|
214
|
-
/**
|
|
215
|
-
* Detects IDs in the output that were not in the source (invented/hallucinated IDs).
|
|
216
|
-
* @param outputIds - IDs extracted from LLM output
|
|
217
|
-
* @param sourceIds - IDs that were present in the source input
|
|
218
|
-
* @returns Error message if invented IDs found, undefined if all IDs are valid
|
|
197
|
+
* @example
|
|
198
|
+
* parseTranslationLine('P1 - Hello')?.id === 'P1'
|
|
219
199
|
*/
|
|
220
|
-
declare const
|
|
200
|
+
declare const parseTranslationLine: (line: string) => {
|
|
201
|
+
id: string;
|
|
202
|
+
translation: string;
|
|
203
|
+
} | null;
|
|
221
204
|
/**
|
|
222
|
-
*
|
|
223
|
-
* @param text - The full LLM output text
|
|
224
|
-
* @returns Error message if truncated segments found, undefined if all segments have content
|
|
225
|
-
*/
|
|
226
|
-
declare const detectTruncatedSegments: (text: string) => string | undefined;
|
|
227
|
-
/**
|
|
228
|
-
* Validates translation marker format and returns error message if invalid.
|
|
229
|
-
* Catches common AI hallucinations like malformed reference IDs.
|
|
205
|
+
* Parses bulk translation text into a Map for efficient O(1) lookup.
|
|
230
206
|
*
|
|
231
|
-
*
|
|
232
|
-
* @returns Error message if invalid, undefined if valid
|
|
233
|
-
*/
|
|
234
|
-
declare const validateTranslationMarkers: (text: string) => string | undefined;
|
|
235
|
-
/**
|
|
236
|
-
* Normalizes translation text by splitting merged markers onto separate lines.
|
|
237
|
-
* LLMs sometimes put multiple translations on the same line.
|
|
207
|
+
* Handles multi-line translations: subsequent non-marker lines belong to the previous ID.
|
|
238
208
|
*
|
|
239
|
-
* @param
|
|
240
|
-
* @returns
|
|
241
|
-
*/
|
|
242
|
-
declare const normalizeTranslationText: (content: string) => string;
|
|
243
|
-
/**
|
|
244
|
-
* Extracts translation IDs from text in order of appearance.
|
|
209
|
+
* @param rawText - Raw text containing translations in format "ID - Translation text"
|
|
210
|
+
* @returns An object with `count` and `translationMap`
|
|
245
211
|
*
|
|
246
|
-
* @
|
|
247
|
-
*
|
|
212
|
+
* @example
|
|
213
|
+
* parseTranslations('P1 - a\\nP2 - b').count === 2
|
|
248
214
|
*/
|
|
249
|
-
declare const
|
|
215
|
+
declare const parseTranslations: (rawText: string) => {
|
|
216
|
+
count: number;
|
|
217
|
+
translationMap: Map<string, string>;
|
|
218
|
+
};
|
|
250
219
|
/**
|
|
251
|
-
*
|
|
252
|
-
* E.g., "P11622a" -> 11622, "C123" -> 123, "B45b" -> 45
|
|
220
|
+
* Parse translations into an ordered array (preserving the original response order).
|
|
253
221
|
*
|
|
254
|
-
*
|
|
255
|
-
*
|
|
256
|
-
*/
|
|
257
|
-
declare const extractIdNumber: (id: string) => number;
|
|
258
|
-
/**
|
|
259
|
-
* Extracts the prefix (type) from an excerpt ID.
|
|
260
|
-
* E.g., "P11622a" -> "P", "C123" -> "C", "B45" -> "B"
|
|
222
|
+
* This differs from `parseTranslations()` which returns a Map and therefore cannot represent
|
|
223
|
+
* duplicates as separate entries.
|
|
261
224
|
*
|
|
262
|
-
* @param
|
|
263
|
-
* @returns
|
|
264
|
-
*/
|
|
265
|
-
declare const extractIdPrefix: (id: string) => string;
|
|
266
|
-
/**
|
|
267
|
-
* Validates that translation IDs appear in ascending numeric order within the same prefix type.
|
|
268
|
-
* This catches LLM errors where translations are output in wrong order (e.g., P12659 before P12651).
|
|
225
|
+
* @param rawText - Raw text containing translations in format "ID - Translation text"
|
|
226
|
+
* @returns Array of `{ id, translation }` entries in appearance order
|
|
269
227
|
*
|
|
270
|
-
* @
|
|
271
|
-
*
|
|
228
|
+
* @example
|
|
229
|
+
* parseTranslationsInOrder('P1 - a\\nP2 - b').map((e) => e.id) // => ['P1', 'P2']
|
|
272
230
|
*/
|
|
273
|
-
declare const
|
|
231
|
+
declare const parseTranslationsInOrder: (rawText: string) => {
|
|
232
|
+
id: string;
|
|
233
|
+
translation: string;
|
|
234
|
+
}[];
|
|
235
|
+
//#endregion
|
|
236
|
+
//#region src/validation.d.ts
|
|
274
237
|
/**
|
|
275
|
-
*
|
|
276
|
-
* Allows pasting in multiple blocks where each block is internally ordered.
|
|
277
|
-
* Resets (position going backwards) are allowed between blocks.
|
|
278
|
-
* Errors only when there's disorder WITHIN a block (going backwards then forwards).
|
|
238
|
+
* Human-readable descriptions for each `ValidationErrorType`, intended for client UIs and logs.
|
|
279
239
|
*
|
|
280
|
-
* @
|
|
281
|
-
*
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
240
|
+
* @example
|
|
241
|
+
* VALIDATION_ERROR_TYPE_INFO.arabic_leak.description
|
|
242
|
+
*/
|
|
243
|
+
declare const VALIDATION_ERROR_TYPE_INFO: {
|
|
244
|
+
readonly all_caps: {
|
|
245
|
+
readonly description: "ALL CAPS “shouting” word detected (5+ letters).";
|
|
246
|
+
};
|
|
247
|
+
readonly arabic_leak: {
|
|
248
|
+
readonly description: "Arabic script was detected in output (except ﷺ).";
|
|
249
|
+
};
|
|
250
|
+
readonly archaic_register: {
|
|
251
|
+
readonly description: "Archaic/Biblical English detected (e.g., thou, verily, shalt).";
|
|
252
|
+
};
|
|
253
|
+
readonly duplicate_id: {
|
|
254
|
+
readonly description: "The same segment ID appears more than once in the response.";
|
|
255
|
+
};
|
|
256
|
+
readonly empty_parentheses: {
|
|
257
|
+
readonly description: "Excessive \"()\" patterns detected, often indicating failed/empty term-pairs.";
|
|
258
|
+
};
|
|
259
|
+
readonly implicit_continuation: {
|
|
260
|
+
readonly description: "The response includes continuation/meta phrasing (e.g., \"continued:\", \"implicit continuation\").";
|
|
261
|
+
};
|
|
262
|
+
readonly invalid_marker_format: {
|
|
263
|
+
readonly description: "A segment marker line is malformed (e.g., wrong ID shape or missing content after the dash).";
|
|
264
|
+
};
|
|
265
|
+
readonly invented_id: {
|
|
266
|
+
readonly description: "The response contains a segment ID that does not exist in the provided source corpus.";
|
|
267
|
+
};
|
|
268
|
+
readonly length_mismatch: {
|
|
269
|
+
readonly description: "Translation appears too short relative to Arabic source (heuristic truncation check).";
|
|
270
|
+
};
|
|
271
|
+
readonly meta_talk: {
|
|
272
|
+
readonly description: "The response includes translator/editor notes instead of pure translation.";
|
|
273
|
+
};
|
|
274
|
+
readonly mismatched_colons: {
|
|
275
|
+
readonly description: "Per-segment colon count mismatch between Arabic segment text and its translation chunk (counts \":\" and \":\").";
|
|
276
|
+
};
|
|
277
|
+
readonly missing_id_gap: {
|
|
278
|
+
readonly description: "A gap was detected: the response includes two IDs whose corpus order implies one or more intermediate IDs are missing.";
|
|
279
|
+
};
|
|
280
|
+
readonly multiword_translit_without_gloss: {
|
|
281
|
+
readonly description: "A multi-word transliteration phrase was detected without an immediate parenthetical gloss.";
|
|
282
|
+
};
|
|
283
|
+
readonly newline_after_id: {
|
|
284
|
+
readonly description: "The response used \"ID -\\nText\" instead of \"ID - Text\" (newline immediately after the marker).";
|
|
285
|
+
};
|
|
286
|
+
readonly no_valid_markers: {
|
|
287
|
+
readonly description: "No valid \"ID - ...\" markers were found anywhere in the response.";
|
|
288
|
+
};
|
|
289
|
+
readonly truncated_segment: {
|
|
290
|
+
readonly description: "A segment appears truncated (e.g., only \"…\", \"...\", or \"[INCOMPLETE]\").";
|
|
291
|
+
};
|
|
292
|
+
readonly wrong_diacritics: {
|
|
293
|
+
readonly description: "Wrong diacritics like â/ã/á were detected (should use macrons like ā ī ū).";
|
|
294
|
+
};
|
|
295
|
+
};
|
|
285
296
|
/**
|
|
286
|
-
*
|
|
287
|
-
* Validates markers, normalizes text, and checks order against expected IDs.
|
|
297
|
+
* Validate an LLM translation response against a set of Arabic source segments.
|
|
288
298
|
*
|
|
289
|
-
*
|
|
290
|
-
*
|
|
291
|
-
* @returns Validation result with normalized text and any errors
|
|
292
|
-
*/
|
|
293
|
-
declare const validateTranslations: (rawText: string, expectedIds: string[]) => TranslationValidationResult;
|
|
294
|
-
/**
|
|
295
|
-
* Finds translation IDs that don't exist in the expected store IDs.
|
|
296
|
-
* Used to validate that all pasted translations can be matched before committing.
|
|
299
|
+
* Rules are expressed as a list of typed errors. The caller decides severity.
|
|
300
|
+
* The validator normalizes the response first (marker splitting + line endings).
|
|
297
301
|
*
|
|
298
|
-
*
|
|
299
|
-
*
|
|
300
|
-
*
|
|
301
|
-
|
|
302
|
-
|
|
302
|
+
* Important: `segments` may be the full corpus. The validator reduces to only
|
|
303
|
+
* those IDs parsed from the response (plus detects missing-ID gaps between IDs).
|
|
304
|
+
*
|
|
305
|
+
* @example
|
|
306
|
+
* // Pass (no errors)
|
|
307
|
+
* validateTranslationResponse(
|
|
308
|
+
* [{ id: 'P1', text: 'نص عربي طويل...' }],
|
|
309
|
+
* 'P1 - A complete translation.'
|
|
310
|
+
* ).errors.length === 0
|
|
311
|
+
*
|
|
312
|
+
* @example
|
|
313
|
+
* // Fail (invented ID)
|
|
314
|
+
* validateTranslationResponse(
|
|
315
|
+
* [{ id: 'P1', text: 'نص عربي طويل...' }],
|
|
316
|
+
* 'P2 - This ID is not in the corpus.'
|
|
317
|
+
* ).errors.some(e => e.type === 'invented_id') === true
|
|
318
|
+
*/
|
|
319
|
+
declare const validateTranslationResponse: (segments: Segment[], response: string) => {
|
|
320
|
+
errors: {
|
|
321
|
+
message: string;
|
|
322
|
+
type: string;
|
|
323
|
+
}[];
|
|
324
|
+
normalizedResponse: string;
|
|
325
|
+
parsedIds: never[];
|
|
326
|
+
} | {
|
|
327
|
+
errors: ValidationError[];
|
|
328
|
+
normalizedResponse: string;
|
|
329
|
+
parsedIds: string[];
|
|
330
|
+
};
|
|
303
331
|
//#endregion
|
|
304
|
-
export { MARKER_ID_PATTERN, Markers, type PromptId, type PromptMetadata, type StackedPrompt, TRANSLATION_MARKER_PARTS,
|
|
332
|
+
export { MARKER_ID_PATTERN, Markers, type PromptId, type PromptMetadata, type Segment, type StackedPrompt, TRANSLATION_MARKER_PARTS, VALIDATION_ERROR_TYPE_INFO, type ValidationError, type ValidationErrorType, type ValidationResponseResult, extractTranslationIds, formatExcerptsForPrompt, getMasterPrompt, getPrompt, getPromptIds, getPrompts, getStackedPrompt, normalizeTranslationText, parseTranslationLine, parseTranslations, parseTranslationsInOrder, stackPrompts, validateTranslationResponse };
|
|
305
333
|
//# sourceMappingURL=index.d.ts.map
|