@geolonia/yuuhitsu 0.1.17 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/translate.d.ts.map +1 -1
- package/dist/cli/commands/translate.js +3 -0
- package/dist/cli/commands/translate.js.map +1 -1
- package/dist/provider/claude.d.ts +7 -1
- package/dist/provider/claude.d.ts.map +1 -1
- package/dist/provider/claude.js +79 -2
- package/dist/provider/claude.js.map +1 -1
- package/dist/provider/interface.d.ts +27 -0
- package/dist/provider/interface.d.ts.map +1 -1
- package/dist/tasks/batch-translate.d.ts +1 -0
- package/dist/tasks/batch-translate.d.ts.map +1 -1
- package/dist/tasks/batch-translate.js +2 -1
- package/dist/tasks/batch-translate.js.map +1 -1
- package/dist/tasks/translate.d.ts +34 -41
- package/dist/tasks/translate.d.ts.map +1 -1
- package/dist/tasks/translate.js +329 -404
- package/dist/tasks/translate.js.map +1 -1
- package/package.json +9 -1
package/dist/tasks/translate.js
CHANGED
|
@@ -1,21 +1,20 @@
|
|
|
1
1
|
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
2
2
|
import { dirname, basename, extname, join } from "path";
|
|
3
|
+
import { remark } from "remark";
|
|
4
|
+
import remarkGfm from "remark-gfm";
|
|
5
|
+
import { visit } from "unist-util-visit";
|
|
3
6
|
import { buildGlossaryPrompt } from "./glossary.js";
|
|
4
|
-
export const DEFAULT_MAX_CHUNK_LINES =
|
|
7
|
+
export const DEFAULT_MAX_CHUNK_LINES = 150;
|
|
5
8
|
const MIN_CHUNK_LINES = 50;
|
|
9
|
+
export const DEFAULT_MAX_NODES_PER_BATCH = 200;
|
|
10
|
+
// P-A1: minimum ratio of output characters to input characters (truncation check)
|
|
11
|
+
const MIN_OUTPUT_RATIO = 0.3;
|
|
6
12
|
/**
|
|
7
|
-
* Separate frontmatter from Markdown content
|
|
13
|
+
* Separate frontmatter from Markdown content.
|
|
8
14
|
* Handles: LF, CRLF, no trailing newline after closing ---, trailing spaces, empty frontmatter
|
|
9
|
-
* @param content - Full Markdown content
|
|
10
|
-
* @returns Object with separated frontmatter and body
|
|
11
15
|
*/
|
|
12
16
|
export function separateFrontmatter(content) {
|
|
13
|
-
// Normalize CRLF to LF for regex matching
|
|
14
17
|
const normalized = content.replace(/\r\n/g, "\n");
|
|
15
|
-
// Match frontmatter (two alternations to keep closing --- on its own line):
|
|
16
|
-
// Case 1: non-empty body: ^---\n ... \n---[ \t]*(\n|$)
|
|
17
|
-
// Case 2: empty body: ^---\n---[ \t]*(\n|$)
|
|
18
|
-
// Using alternation avoids the \n? ambiguity that allows --- to match mid-line.
|
|
19
18
|
const frontmatterRegex = /^---\n([\s\S]*?)\n---[ \t]*(\n|$)|^---\n---[ \t]*(\n|$)/;
|
|
20
19
|
const match = normalized.match(frontmatterRegex);
|
|
21
20
|
if (match) {
|
|
@@ -26,359 +25,7 @@ export function separateFrontmatter(content) {
|
|
|
26
25
|
}
|
|
27
26
|
return { frontmatter: null, body: normalized };
|
|
28
27
|
}
|
|
29
|
-
|
|
30
|
-
* Replace fenced code blocks and inline code with placeholders.
|
|
31
|
-
* Uses a line-by-line parser instead of regex to avoid V8 stack overflow
|
|
32
|
-
* on files with many code blocks (backreference + [\s\S]*? causes recursive backtracking).
|
|
33
|
-
*/
|
|
34
|
-
export function protectCodeBlocks(content) {
|
|
35
|
-
const map = new Map();
|
|
36
|
-
let blockIndex = 0;
|
|
37
|
-
let inlineIndex = 0;
|
|
38
|
-
// Step 1: Replace fenced code blocks using line-by-line parsing
|
|
39
|
-
const lines = content.split("\n");
|
|
40
|
-
const resultLines = [];
|
|
41
|
-
let fenceOpen = null; // the backtick sequence that opened the current block
|
|
42
|
-
let blockLines = [];
|
|
43
|
-
for (const line of lines) {
|
|
44
|
-
const fenceMatch = line.match(/^(`{3,})/);
|
|
45
|
-
if (fenceOpen === null) {
|
|
46
|
-
// Not inside a code block
|
|
47
|
-
if (fenceMatch) {
|
|
48
|
-
// Opening fence found
|
|
49
|
-
fenceOpen = fenceMatch[1];
|
|
50
|
-
blockLines = [line];
|
|
51
|
-
}
|
|
52
|
-
else {
|
|
53
|
-
resultLines.push(line);
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
else {
|
|
57
|
-
// Inside a code block — look for closing fence with same or more backticks
|
|
58
|
-
blockLines.push(line);
|
|
59
|
-
if (fenceMatch && fenceMatch[1].length >= fenceOpen.length && line.trim() === fenceMatch[1]) {
|
|
60
|
-
// Closing fence found — store as single-line placeholder (no padding).
|
|
61
|
-
// Padding was previously used to preserve line count, but it caused chunk
|
|
62
|
-
// boundaries to fall inside the placeholder's whitespace region, leading to
|
|
63
|
-
// non-deterministic LLM output when the code block exceeded --max-chunk-lines.
|
|
64
|
-
const original = blockLines.join("\n") + "\n";
|
|
65
|
-
const placeholder = `__CODE_BLOCK_${blockIndex++}__`;
|
|
66
|
-
map.set(placeholder, original);
|
|
67
|
-
resultLines.push(placeholder);
|
|
68
|
-
fenceOpen = null;
|
|
69
|
-
blockLines = [];
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
// If we ended inside an unclosed fence, emit lines as-is
|
|
74
|
-
if (fenceOpen !== null) {
|
|
75
|
-
resultLines.push(...blockLines);
|
|
76
|
-
}
|
|
77
|
-
let result = resultLines.join("\n");
|
|
78
|
-
// Step 2: Replace inline code (single backtick, not within code blocks)
|
|
79
|
-
result = result.replace(/`([^`\n]+)`/g, (match) => {
|
|
80
|
-
const placeholder = `__INLINE_CODE_${inlineIndex++}__`;
|
|
81
|
-
map.set(placeholder, match);
|
|
82
|
-
return placeholder;
|
|
83
|
-
});
|
|
84
|
-
return { text: result, map };
|
|
85
|
-
}
|
|
86
|
-
/**
|
|
87
|
-
* Restore placeholders back to original code blocks/inline code.
|
|
88
|
-
*/
|
|
89
|
-
export function restoreCodeBlocks(content, map) {
|
|
90
|
-
let result = content;
|
|
91
|
-
for (const [placeholder, original] of map.entries()) {
|
|
92
|
-
if (original.endsWith("\n")) {
|
|
93
|
-
// Fenced code block: original already has trailing \n — just replace, one \n total.
|
|
94
|
-
result = result.split(placeholder + "\n").join(original);
|
|
95
|
-
result = result.split(placeholder).join(original);
|
|
96
|
-
}
|
|
97
|
-
else {
|
|
98
|
-
// Inline code: original has NO trailing \n. Preserve any \n that follows the placeholder
|
|
99
|
-
// so that newlines inserted by restoreBlockBoundaries (Layer 3) are not consumed.
|
|
100
|
-
// e.g. "__INLINE_CODE_0__\n- next item" → "`code`\n- next item" (not "`code`- next item")
|
|
101
|
-
result = result.split(placeholder + "\n").join(original + "\n");
|
|
102
|
-
result = result.split(placeholder).join(original);
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
return result;
|
|
106
|
-
}
|
|
107
|
-
export const BLOCK_BOUNDARY_SENTINEL = "<!--BB-->";
|
|
108
|
-
// Temporary escape for pre-existing <!--BB--> literals in user content.
|
|
109
|
-
// Uses a character sequence unlikely to appear in markdown documents.
|
|
110
|
-
const ESCAPED_SENTINEL = "\x01BB\x01";
|
|
111
|
-
// Fallback regex: catches LLM-deformed variants (e.g. <!-- BB -->, <!--BB__-->, <!--BBx-->)
|
|
112
|
-
const SENTINEL_FALLBACK = /<!--\s*BB[a-zA-Z0-9_-]*\s*-->/g;
|
|
113
|
-
// Broad check: detects severely-deformed residuals not caught by SENTINEL_FALLBACK
|
|
114
|
-
const SENTINEL_RESIDUAL_CHECK = /<!--[\s\S]*?BB[\s\S]*?-->/g;
|
|
115
|
-
/**
|
|
116
|
-
* Insert block boundary sentinels before structural Markdown elements
|
|
117
|
-
* (list items, headings, horizontal rules, code fences, code block placeholders).
|
|
118
|
-
* P-A4: prevents newline collapse around structural boundaries during LLM translation.
|
|
119
|
-
* When called after protectCodeBlocks, fenced blocks appear as __CODE_BLOCK_N__ placeholders;
|
|
120
|
-
* the function treats those placeholders as structural to protect fence-adjacent newlines.
|
|
121
|
-
*/
|
|
122
|
-
export function protectBlockBoundaries(content) {
|
|
123
|
-
// Escape all sentinel-like patterns (exact + variants) to prevent control-marker confusion.
|
|
124
|
-
// SENTINEL_FALLBACK covers <!--BB-->, <!-- BB -->, <!--BBx-->, <!--BB-x-->, etc.
|
|
125
|
-
// On restore, these all round-trip back to <!--BB--> (minor cosmetic vs. content deletion).
|
|
126
|
-
const escaped = content.replace(SENTINEL_FALLBACK, ESCAPED_SENTINEL);
|
|
127
|
-
const lines = escaped.split("\n");
|
|
128
|
-
const result = [];
|
|
129
|
-
for (const line of lines) {
|
|
130
|
-
const isListItem = /^\s*[-*+]\s/.test(line) || // unordered list
|
|
131
|
-
/^\s*\d+\.\s/.test(line); // ordered list
|
|
132
|
-
const isOtherStructural = !isListItem && (/^ {0,3}#{1,6}\s/.test(line) || // heading (CommonMark: 0-3 leading spaces)
|
|
133
|
-
/^ {0,3}-{3,}\s*$/.test(line) || // hr (dash, 0-3 leading spaces)
|
|
134
|
-
/^ {0,3}\*{3,}\s*$/.test(line) || // hr (asterisk, 0-3 leading spaces)
|
|
135
|
-
/^ {0,3}_{3,}\s*$/.test(line) || // hr (underscore, 0-3 leading spaces)
|
|
136
|
-
/^\s*`{3,}/.test(line) || // fenced code (backtick)
|
|
137
|
-
/^\s*~{3,}/.test(line) || // fenced code (tilde)
|
|
138
|
-
/^__CODE_BLOCK_\d+__$/.test(line.trim()) // code block placeholder (after protectCodeBlocks)
|
|
139
|
-
);
|
|
140
|
-
if (isListItem) {
|
|
141
|
-
// P-A4 v3: double sentinel BEFORE each list item to resist LLM collapse.
|
|
142
|
-
// 0.1.16 used a single sentinel; LLM deleted it and collapsed items to one line.
|
|
143
|
-
// Two sentinels before each item mean the LLM must delete both to collapse — higher bar.
|
|
144
|
-
// After-sentinels are intentionally omitted to preserve clean round-trip (no trailing \n).
|
|
145
|
-
result.push(BLOCK_BOUNDARY_SENTINEL);
|
|
146
|
-
result.push(BLOCK_BOUNDARY_SENTINEL);
|
|
147
|
-
result.push(line);
|
|
148
|
-
}
|
|
149
|
-
else if (isOtherStructural) {
|
|
150
|
-
result.push(BLOCK_BOUNDARY_SENTINEL);
|
|
151
|
-
result.push(line);
|
|
152
|
-
}
|
|
153
|
-
else {
|
|
154
|
-
result.push(line);
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
return result.join("\n");
|
|
158
|
-
}
|
|
159
|
-
/**
|
|
160
|
-
* Remove block boundary sentinels and restore newlines lost during LLM translation.
|
|
161
|
-
* Uses a 3-pass strategy plus Layer 3 list-aware fallback (P-A4 v3):
|
|
162
|
-
* Pass 1: normalize LLM-deformed variants (e.g. <!-- BB -->) to exact sentinel form
|
|
163
|
-
* Pass 2: split + restore newlines (handles both clean and collapsed sentinel cases)
|
|
164
|
-
* Pass 3: post-restore warning for residual sentinel-like patterns (silent failure prevention)
|
|
165
|
-
* Layer 3: detect list items collapsed onto one line ("- A- B") and split them back
|
|
166
|
-
* Applied unconditionally — handles the case where LLM deleted ALL sentinels
|
|
167
|
-
*/
|
|
168
|
-
export function restoreBlockBoundaries(content) {
|
|
169
|
-
// Pass 1: normalize variant sentinels introduced by LLM deformation (cmd_389 Root Cause A/B)
|
|
170
|
-
const normalized = content.replace(SENTINEL_FALLBACK, BLOCK_BOUNDARY_SENTINEL);
|
|
171
|
-
let restored;
|
|
172
|
-
// Pass 2: split + restore newlines
|
|
173
|
-
if (!normalized.includes(BLOCK_BOUNDARY_SENTINEL)) {
|
|
174
|
-
// No exact sentinels: fall through to Pass 3 + Layer 3 (handles fully-deleted sentinel case)
|
|
175
|
-
restored = normalized;
|
|
176
|
-
}
|
|
177
|
-
else {
|
|
178
|
-
const parts = normalized.split(BLOCK_BOUNDARY_SENTINEL);
|
|
179
|
-
restored = parts[0];
|
|
180
|
-
for (let i = 1; i < parts.length; i++) {
|
|
181
|
-
// Strip a leading newline from next part (present when LLM preserved sentinel on its own line)
|
|
182
|
-
const stripped = parts[i].replace(/^\n/, "");
|
|
183
|
-
if (restored.length === 0) {
|
|
184
|
-
// Sentinel was at the very start of content — no preceding text to separate from
|
|
185
|
-
restored = stripped;
|
|
186
|
-
}
|
|
187
|
-
else {
|
|
188
|
-
// Ensure restored ends with exactly one newline before appending next part
|
|
189
|
-
restored = restored.replace(/\n?$/, "\n") + stripped;
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
// Pass 3: post-restore warning for patterns not caught by SENTINEL_FALLBACK.
|
|
194
|
-
// Runs for BOTH the sentinel-present and no-sentinel paths so fully-deformed output
|
|
195
|
-
// is still flagged. Check before unescaping to avoid false positives from user <!--BB-->.
|
|
196
|
-
const residuals = restored.match(SENTINEL_RESIDUAL_CHECK);
|
|
197
|
-
if (residuals && residuals.length > 0) {
|
|
198
|
-
console.warn(`[yuuhitsu] restoreBlockBoundaries: ${residuals.length} residual sentinel-like pattern(s) detected after restore:`, residuals.slice(0, 5));
|
|
199
|
-
}
|
|
200
|
-
// Unescape any pre-existing <!--BB--> that were escaped before protection
|
|
201
|
-
if (restored.includes(ESCAPED_SENTINEL)) {
|
|
202
|
-
restored = restored.split(ESCAPED_SENTINEL).join(BLOCK_BOUNDARY_SENTINEL);
|
|
203
|
-
}
|
|
204
|
-
// Layer 3 (P-A4 v3): list-aware fallback — detect inline list concatenation that
|
|
205
|
-
// survived Layer 1+2 (LLM joined "- A\n- B" into "- A- B" on a single line).
|
|
206
|
-
// Applied unconditionally: handles the worst case where ALL sentinels were deleted.
|
|
207
|
-
// Uses /gm flag: ^ anchors to line start per line (multiline mode).
|
|
208
|
-
//
|
|
209
|
-
// Two sub-patterns:
|
|
210
|
-
// (a) Spaced: "- A- B" or "- A - B" — requires whitespace after 2nd marker (original).
|
|
211
|
-
// (b) Placeholder-end: "- A: __INLINE_CODE_0__-B" — inline code placeholder at end of
|
|
212
|
-
// previous item followed directly by next list marker (no space). This is the pattern
|
|
213
|
-
// LLM produces when translating to Japanese (Japanese text has no space after marker).
|
|
214
|
-
// (?<!\s) guard: require the char immediately before the 2nd marker to be non-whitespace.
|
|
215
|
-
// This prevents false-positive splits on prose like "- Linux - macOS support" where the
|
|
216
|
-
// inline "- " is preceded by a space (valid prose) rather than collapsed item text.
|
|
217
|
-
// True collapse ("- A- B") has NO space before the 2nd marker → lookbehind passes.
|
|
218
|
-
const LIST_INLINE_MERGE_UNORDERED = /(^\s*[-*+]\s[^\n]*?)(?<!\s)([-*+]\s)/gm;
|
|
219
|
-
const LIST_INLINE_MERGE_ORDERED = /(^\s*\d+\.\s[^\n]*?)(?<!\s)(\d+\.\s)/gm;
|
|
220
|
-
// Placeholder-end pattern: matches code-placeholder end (\d+__) immediately before list marker.
|
|
221
|
-
// Inserts "\n" + space (standard list-item format: "- content") so the new line passes
|
|
222
|
-
// /^\s*[-*+]\s/ checks in integration tests.
|
|
223
|
-
// (?=[^a-z]) guard avoids false positives for "__CODE__-style" (lowercase word hyphens).
|
|
224
|
-
// e.g. "__INLINE_CODE_0__-次の項目" → "__INLINE_CODE_0__\n- 次の項目"
|
|
225
|
-
const LIST_INLINE_MERGE_PLACEHOLDER_UNORDERED = /(\d+__)([-*+])(?=[^a-z])/gm;
|
|
226
|
-
const LIST_INLINE_MERGE_PLACEHOLDER_ORDERED = /(\d+__)(\d+\.)(?=[^a-z])/gm;
|
|
227
|
-
// Apply iteratively: JavaScript replace() scans left-to-right in the original string,
|
|
228
|
-
// so "- A- B- C" needs two passes (first splits A-B, second splits B-C on the new line).
|
|
229
|
-
let layer3applied = false;
|
|
230
|
-
let prev;
|
|
231
|
-
do {
|
|
232
|
-
prev = restored;
|
|
233
|
-
restored = restored.replace(LIST_INLINE_MERGE_UNORDERED, (_match, p1, p2) => {
|
|
234
|
-
layer3applied = true;
|
|
235
|
-
return `${p1}\n${p2}`;
|
|
236
|
-
});
|
|
237
|
-
restored = restored.replace(LIST_INLINE_MERGE_ORDERED, (_match, p1, p2) => {
|
|
238
|
-
layer3applied = true;
|
|
239
|
-
return `${p1}\n${p2}`;
|
|
240
|
-
});
|
|
241
|
-
restored = restored.replace(LIST_INLINE_MERGE_PLACEHOLDER_UNORDERED, (_match, p1, p2) => {
|
|
242
|
-
layer3applied = true;
|
|
243
|
-
return `${p1}\n${p2} `; // trailing space ensures valid "- content" list-item format
|
|
244
|
-
});
|
|
245
|
-
restored = restored.replace(LIST_INLINE_MERGE_PLACEHOLDER_ORDERED, (_match, p1, p2) => {
|
|
246
|
-
layer3applied = true;
|
|
247
|
-
return `${p1}\n${p2} `;
|
|
248
|
-
});
|
|
249
|
-
} while (restored !== prev);
|
|
250
|
-
if (layer3applied) {
|
|
251
|
-
console.warn("[yuuhitsu] restoreBlockBoundaries: Layer 3 list-aware fallback applied — " +
|
|
252
|
-
"LLM concatenated list items inline. Layer 1+2 sentinels were insufficient.");
|
|
253
|
-
}
|
|
254
|
-
return restored;
|
|
255
|
-
}
|
|
256
|
-
const DEFAULT_TEMPLATE = `You are a professional translator. Translate the following Markdown document to {{targetLanguage}}.
|
|
257
|
-
|
|
258
|
-
Rules:
|
|
259
|
-
- Preserve all Markdown formatting (headings, links, code blocks, tables, lists)
|
|
260
|
-
- Do not translate code blocks, URLs, or file paths
|
|
261
|
-
- Do not translate frontmatter keys (only translate values where appropriate)
|
|
262
|
-
- Maintain the same document structure
|
|
263
|
-
- Produce natural, fluent text in the target language
|
|
264
|
-
- Every opening \`\`\` you write MUST be followed by a language identifier on the same line (e.g., \`\`\`json, \`\`\`bash, \`\`\`typescript)
|
|
265
|
-
- If the language is unknown, use \`\`\`text — never emit a bare opening \`\`\`
|
|
266
|
-
|
|
267
|
-
CRITICAL - Link and URL preservation:
|
|
268
|
-
- NEVER modify any URLs or link paths. Keep all href/src values exactly as-is.
|
|
269
|
-
- NEVER change internal link paths (e.g., /ja/..., /en/..., ./relative-path). Preserve them verbatim.
|
|
270
|
-
- NEVER convert external URLs to different language versions.
|
|
271
|
-
- If the source has [text](/ja/changelog), the output must keep the same path, only translate the link text if needed.
|
|
272
|
-
- Example: [紹介](/ja/intro) → translate "紹介" but keep "/ja/intro" unchanged
|
|
273
|
-
- Example: [MDN](https://developer.mozilla.org/ja/) → keep the /ja/ in URL, translate "MDN" if needed
|
|
274
|
-
|
|
275
|
-
Additional rules for Japanese translation:
|
|
276
|
-
- Use full-width punctuation: 。、?! (not .,?!)
|
|
277
|
-
- Add half-width spaces around English words and numbers (e.g., "Vela とは", "NGSIv2 は", "3 つの")
|
|
278
|
-
- Use natural Japanese terms for technical words where appropriate (e.g., "registration" → "登録", "subscription" → "サブスクリプション")
|
|
279
|
-
- Keep product names, proper nouns, and abbreviations unchanged (e.g., Vela, FIWARE, NGSIv2, NGSI-LD, MCP)
|
|
280
|
-
|
|
281
|
-
Example — code fence with language identifier:
|
|
282
|
-
Bad: \`\`\` echo hello \`\`\`
|
|
283
|
-
Good: \`\`\`bash echo hello \`\`\``;
|
|
284
|
-
function buildPrompt(content, targetLang, hasPlaceholders, hasSentinels, templateContent, glossaryConfig) {
|
|
285
|
-
const template = templateContent || DEFAULT_TEMPLATE;
|
|
286
|
-
let systemPrompt = template
|
|
287
|
-
.replace(/\{\{targetLanguage\}\}/g, targetLang)
|
|
288
|
-
.replace(/\{\{content\}\}/g, "");
|
|
289
|
-
if (glossaryConfig) {
|
|
290
|
-
const glossarySection = buildGlossaryPrompt(glossaryConfig, targetLang);
|
|
291
|
-
if (glossarySection) {
|
|
292
|
-
systemPrompt += glossarySection;
|
|
293
|
-
}
|
|
294
|
-
}
|
|
295
|
-
if (hasSentinels) {
|
|
296
|
-
systemPrompt +=
|
|
297
|
-
"\n\n## Block boundary markers (P-A4 v2)\n\n" +
|
|
298
|
-
"Lines containing the marker `<!--BB-->` are **block boundary markers** inserted\n" +
|
|
299
|
-
"by the translation pipeline to preserve newlines around structural elements.\n\n" +
|
|
300
|
-
"Rules for `<!--BB-->` markers (HTML comment form):\n" +
|
|
301
|
-
"- Output every `<!--BB-->` marker **verbatim and unchanged** in your translation.\n" +
|
|
302
|
-
"- Each marker must remain on its own line, in the same position relative to\n" +
|
|
303
|
-
" surrounding content.\n" +
|
|
304
|
-
"- Do not translate, remove, paraphrase, modify, normalize whitespace inside, or\n" +
|
|
305
|
-
" rename these markers.\n" +
|
|
306
|
-
"- Do not add new `<!--BB-->` markers; only preserve existing ones.\n\n" +
|
|
307
|
-
"Good example (correct preservation):\n" +
|
|
308
|
-
" Input:\n" +
|
|
309
|
-
" <!--BB-->\n" +
|
|
310
|
-
" - List item one\n" +
|
|
311
|
-
" <!--BB-->\n" +
|
|
312
|
-
" - List item two\n" +
|
|
313
|
-
" <!--BB-->\n" +
|
|
314
|
-
" ## Section heading\n" +
|
|
315
|
-
" Output:\n" +
|
|
316
|
-
" <!--BB-->\n" +
|
|
317
|
-
" - リスト項目その一\n" +
|
|
318
|
-
" <!--BB-->\n" +
|
|
319
|
-
" - リスト項目その二\n" +
|
|
320
|
-
" <!--BB-->\n" +
|
|
321
|
-
" ## セクション見出し\n\n" +
|
|
322
|
-
"Bad examples (DO NOT do these):\n" +
|
|
323
|
-
" - <!--BB--> ❌ → <!--BB__--> (added suffix — FORBIDDEN)\n" +
|
|
324
|
-
" - <!--BB--> ❌ → <!-- BB --> (added internal whitespace — FORBIDDEN)\n" +
|
|
325
|
-
" - <!--BB--> ❌ → <!--bb--> (case change — FORBIDDEN)\n" +
|
|
326
|
-
" - <!--BB--> ❌ → (omitted) (deleted — FORBIDDEN)\n" +
|
|
327
|
-
" - <!--BB--> ❌ → <!--BB-x--> (added suffix — FORBIDDEN)\n\n" +
|
|
328
|
-
"Preserve the marker exactly: 9 characters, opening `<!--`, content `BB`,\n" +
|
|
329
|
-
"closing `-->`, no whitespace, no case changes, no suffixes.\n\n" +
|
|
330
|
-
"## List boundary protection (P-A4 v3 list addendum)\n\n" +
|
|
331
|
-
"For list items (lines starting with `-`, `*`, `+`, or `1.`),\n" +
|
|
332
|
-
"`<!--BB-->` markers appear **multiple times in a row** (e.g., two consecutive\n" +
|
|
333
|
-
"`<!--BB-->` lines). This is intentional — preserve ALL of them.\n\n" +
|
|
334
|
-
"Bad example (DO NOT do this):\n" +
|
|
335
|
-
" <!--BB-->\n" +
|
|
336
|
-
" <!--BB-->\n" +
|
|
337
|
-
" - Item A\n" +
|
|
338
|
-
" <!--BB-->\n" +
|
|
339
|
-
" <!--BB-->\n" +
|
|
340
|
-
" - Item B\n\n" +
|
|
341
|
-
" ❌ becomes: - Item A- Item B (list items on one line — FORBIDDEN)\n\n" +
|
|
342
|
-
"Good example (keep each item on its own line):\n" +
|
|
343
|
-
" <!--BB-->\n" +
|
|
344
|
-
" <!--BB-->\n" +
|
|
345
|
-
" - アイテムA\n" +
|
|
346
|
-
" <!--BB-->\n" +
|
|
347
|
-
" <!--BB-->\n" +
|
|
348
|
-
" - アイテムB\n\n" +
|
|
349
|
-
"Key rules for lists:\n" +
|
|
350
|
-
"- Each list item MUST remain on its own line.\n" +
|
|
351
|
-
"- NEVER join two list items into one line (e.g., `- A- B` is FORBIDDEN).\n" +
|
|
352
|
-
"- Preserve ALL `<!--BB-->` markers, even when they appear consecutively.\n\n" +
|
|
353
|
-
"Bad example (DO NOT do this) — inline code list collapse:\n" +
|
|
354
|
-
" - Item A: `value 1`\n" +
|
|
355
|
-
" - Item B: `value 2`\n\n" +
|
|
356
|
-
" ❌ becomes: - Item A: `value 1`- Item B: `value 2` (FORBIDDEN, space before marker)\n" +
|
|
357
|
-
" ❌ becomes: - Item A: `value 1`-Item B: `value 2` (FORBIDDEN, no space)\n\n" +
|
|
358
|
-
"Good example: each list item must remain on its own line, even when items contain inline code:\n" +
|
|
359
|
-
" - アイテムA: `value 1`\n" +
|
|
360
|
-
" - アイテムB: `value 2`";
|
|
361
|
-
}
|
|
362
|
-
if (hasPlaceholders) {
|
|
363
|
-
systemPrompt +=
|
|
364
|
-
"\n\nIMPORTANT - Placeholder preservation:\n" +
|
|
365
|
-
"- Tokens matching __CODE_BLOCK_N__ or __INLINE_CODE_N__ are placeholders for code blocks/inline code.\n" +
|
|
366
|
-
"- Output them VERBATIM and UNCHANGED. Do NOT translate, modify, or remove them.\n" +
|
|
367
|
-
"- Example: if input has __CODE_BLOCK_0__, output must contain __CODE_BLOCK_0__ exactly.";
|
|
368
|
-
}
|
|
369
|
-
return [
|
|
370
|
-
{ role: "system", content: systemPrompt },
|
|
371
|
-
{ role: "user", content },
|
|
372
|
-
];
|
|
373
|
-
}
|
|
374
|
-
function resolveOutputPath(inputPath, targetLang, outputPath) {
|
|
375
|
-
if (outputPath)
|
|
376
|
-
return outputPath;
|
|
377
|
-
const dir = dirname(inputPath);
|
|
378
|
-
const ext = extname(inputPath);
|
|
379
|
-
const base = basename(inputPath, ext);
|
|
380
|
-
return join(dir, `${base}.${targetLang}${ext}`);
|
|
381
|
-
}
|
|
28
|
+
// ─── Chunking utilities (kept for splitIntoChunks export) ───────────────────
|
|
382
29
|
/**
|
|
383
30
|
* Find positions (line indices) of Markdown headings at the given level,
|
|
384
31
|
* excluding lines inside fenced code blocks or table rows.
|
|
@@ -389,7 +36,7 @@ export function findHeadingPositions(lines, level) {
|
|
|
389
36
|
const prefix = "#".repeat(level) + " ";
|
|
390
37
|
for (let i = 0; i < lines.length; i++) {
|
|
391
38
|
const line = lines[i];
|
|
392
|
-
if (
|
|
39
|
+
if (/^(`{3,}|~{3,})/.test(line))
|
|
393
40
|
inCodeBlock = !inCodeBlock;
|
|
394
41
|
if (inCodeBlock)
|
|
395
42
|
continue;
|
|
@@ -401,8 +48,8 @@ export function findHeadingPositions(lines, level) {
|
|
|
401
48
|
return positions;
|
|
402
49
|
}
|
|
403
50
|
/**
|
|
404
|
-
* Split lines at the given positions into chunks
|
|
405
|
-
*
|
|
51
|
+
* Split lines at the given positions into chunks. Segments exceeding maxChunkLines
|
|
52
|
+
* are further split using ### headings or safeSplitLines.
|
|
406
53
|
*/
|
|
407
54
|
export function splitAtPositions(lines, positions, maxChunkLines) {
|
|
408
55
|
const result = [];
|
|
@@ -417,8 +64,6 @@ export function splitAtPositions(lines, positions, maxChunkLines) {
|
|
|
417
64
|
continue;
|
|
418
65
|
if (segmentLines.length > maxChunkLines) {
|
|
419
66
|
const subPositions = findHeadingPositions(segmentLines, 3);
|
|
420
|
-
// Filter out position 0: splitting at the start doesn't reduce segment size
|
|
421
|
-
// and causes infinite recursion when the only heading is at position 0.
|
|
422
67
|
const effectivePositions = subPositions.filter((p) => p > 0);
|
|
423
68
|
if (effectivePositions.length > 0) {
|
|
424
69
|
result.push(...splitAtPositions(segmentLines, effectivePositions, maxChunkLines));
|
|
@@ -443,12 +88,8 @@ export function safeSplitLines(lines, maxChunkLines) {
|
|
|
443
88
|
let inCodeBlock = false;
|
|
444
89
|
for (let i = 0; i < lines.length; i++) {
|
|
445
90
|
const line = lines[i];
|
|
446
|
-
const isFence =
|
|
91
|
+
const isFence = /^(`{3,}|~{3,})/.test(line);
|
|
447
92
|
const isTableLine = line.startsWith("|");
|
|
448
|
-
// Check split eligibility BEFORE toggling fence state:
|
|
449
|
-
// - never split inside a code block
|
|
450
|
-
// - never split ON a fence line (would separate opening/closing ``` from their block)
|
|
451
|
-
// - never split ON a table row
|
|
452
93
|
const canSplitHere = !inCodeBlock && !isFence && !isTableLine;
|
|
453
94
|
if (isFence)
|
|
454
95
|
inCodeBlock = !inCodeBlock;
|
|
@@ -463,8 +104,7 @@ export function safeSplitLines(lines, maxChunkLines) {
|
|
|
463
104
|
return chunks.filter((c) => c.trim().length > 0);
|
|
464
105
|
}
|
|
465
106
|
/**
|
|
466
|
-
* Merge chunks smaller than MIN_CHUNK_LINES into the previous chunk
|
|
467
|
-
* as long as the merged result does not exceed maxLines.
|
|
107
|
+
* Merge chunks smaller than MIN_CHUNK_LINES into the previous chunk.
|
|
468
108
|
*/
|
|
469
109
|
export function mergeSmallChunks(chunks, maxLines) {
|
|
470
110
|
if (chunks.length <= 1)
|
|
@@ -500,9 +140,294 @@ export function splitIntoChunks(content, maxChunkLines = DEFAULT_MAX_CHUNK_LINES
|
|
|
500
140
|
}
|
|
501
141
|
return safeSplitLines(lines, maxChunkLines);
|
|
502
142
|
}
|
|
143
|
+
/**
|
|
144
|
+
* Replace fenced code blocks and inline code with placeholders.
|
|
145
|
+
* Used by glossary-fix.ts to protect code blocks before text replacement.
|
|
146
|
+
*/
|
|
147
|
+
export function protectCodeBlocks(content) {
|
|
148
|
+
const map = new Map();
|
|
149
|
+
let blockIndex = 0;
|
|
150
|
+
let inlineIndex = 0;
|
|
151
|
+
const lines = content.split("\n");
|
|
152
|
+
const resultLines = [];
|
|
153
|
+
let fenceOpen = null;
|
|
154
|
+
let blockLines = [];
|
|
155
|
+
for (const line of lines) {
|
|
156
|
+
const fenceMatch = line.match(/^(`{3,})/);
|
|
157
|
+
if (fenceOpen === null) {
|
|
158
|
+
if (fenceMatch) {
|
|
159
|
+
fenceOpen = fenceMatch[1];
|
|
160
|
+
blockLines = [line];
|
|
161
|
+
}
|
|
162
|
+
else {
|
|
163
|
+
resultLines.push(line);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
else {
|
|
167
|
+
blockLines.push(line);
|
|
168
|
+
if (fenceMatch && fenceMatch[1].length >= fenceOpen.length && line.trim() === fenceMatch[1]) {
|
|
169
|
+
const original = blockLines.join("\n") + "\n";
|
|
170
|
+
const placeholder = `__CODE_BLOCK_${blockIndex++}__`;
|
|
171
|
+
map.set(placeholder, original);
|
|
172
|
+
resultLines.push(placeholder);
|
|
173
|
+
fenceOpen = null;
|
|
174
|
+
blockLines = [];
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
if (fenceOpen !== null) {
|
|
179
|
+
resultLines.push(...blockLines);
|
|
180
|
+
}
|
|
181
|
+
let result = resultLines.join("\n");
|
|
182
|
+
result = result.replace(/`([^`\n]+)`/g, (match) => {
|
|
183
|
+
const placeholder = `__INLINE_CODE_${inlineIndex++}__`;
|
|
184
|
+
map.set(placeholder, match);
|
|
185
|
+
return placeholder;
|
|
186
|
+
});
|
|
187
|
+
return { text: result, map };
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* Restore placeholders back to original code blocks/inline code.
|
|
191
|
+
*/
|
|
192
|
+
export function restoreCodeBlocks(content, map) {
|
|
193
|
+
let result = content;
|
|
194
|
+
for (const [placeholder, original] of map.entries()) {
|
|
195
|
+
if (original.endsWith("\n")) {
|
|
196
|
+
result = result.split(placeholder + "\n").join(original);
|
|
197
|
+
result = result.split(placeholder).join(original);
|
|
198
|
+
}
|
|
199
|
+
else {
|
|
200
|
+
result = result.split(placeholder + "\n").join(original + "\n");
|
|
201
|
+
result = result.split(placeholder).join(original);
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
return result;
|
|
205
|
+
}
|
|
206
|
+
/**
|
|
207
|
+
* Extract all translatable text nodes from an mdast AST.
|
|
208
|
+
* remark AST guarantees text nodes cannot be inside code/inlineCode nodes,
|
|
209
|
+
* so we only skip empty/whitespace-only nodes.
|
|
210
|
+
*/
|
|
211
|
+
function extractTextNodes(ast) {
|
|
212
|
+
const nodes = [];
|
|
213
|
+
visit(ast, "text", (node) => {
|
|
214
|
+
if (!node.value.trim())
|
|
215
|
+
return;
|
|
216
|
+
nodes.push({ node, id: nodes.length });
|
|
217
|
+
});
|
|
218
|
+
return nodes;
|
|
219
|
+
}
|
|
220
|
+
/**
|
|
221
|
+
* Build system prompt for text-mode batch translation (Gemini / Ollama fallback).
|
|
222
|
+
* Includes JSON format instructions since we're relying on the LLM to output JSON.
|
|
223
|
+
*/
|
|
224
|
+
function buildBatchSystemPrompt(targetLang, templateContent, glossaryConfig) {
|
|
225
|
+
const basePrompt = templateContent
|
|
226
|
+
? templateContent.replace(/\{\{targetLanguage\}\}/g, targetLang)
|
|
227
|
+
: `You are a professional translator. Translate text segments to ${targetLang}.
|
|
228
|
+
|
|
229
|
+
Rules:
|
|
230
|
+
- Translate only the text content; do not add or remove punctuation structure
|
|
231
|
+
- Preserve proper nouns, code identifiers, URLs, and file paths unchanged
|
|
232
|
+
- Produce natural, fluent text in the target language
|
|
233
|
+
- For Japanese: use full-width punctuation (。、?!), add half-width spaces around English words/numbers
|
|
234
|
+
- Keep product names, abbreviations, and technical terms unchanged (e.g., NGSI-LD, MCP, GeoJSON)`;
|
|
235
|
+
let prompt = basePrompt;
|
|
236
|
+
if (glossaryConfig) {
|
|
237
|
+
const glossarySection = buildGlossaryPrompt(glossaryConfig, targetLang);
|
|
238
|
+
if (glossarySection)
|
|
239
|
+
prompt += glossarySection;
|
|
240
|
+
}
|
|
241
|
+
prompt += `
|
|
242
|
+
|
|
243
|
+
## Translation format
|
|
244
|
+
|
|
245
|
+
You will receive a JSON object with a "segments" array.
|
|
246
|
+
Each segment has an "id" (integer) and "text" (string to translate).
|
|
247
|
+
|
|
248
|
+
Return ONLY a valid JSON object with a "translations" array.
|
|
249
|
+
Each translation must have the same "id" and the translated "text".
|
|
250
|
+
Do not include any explanation, markdown, or text outside the JSON object.
|
|
251
|
+
|
|
252
|
+
Example input:
|
|
253
|
+
{"segments": [{"id": 0, "text": "Hello world"}, {"id": 1, "text": "This is a test."}]}
|
|
254
|
+
|
|
255
|
+
Example output:
|
|
256
|
+
{"translations": [{"id": 0, "text": "こんにちは世界"}, {"id": 1, "text": "これはテストです。"}]}`;
|
|
257
|
+
return prompt;
|
|
258
|
+
}
|
|
259
|
+
/**
 * Build system prompt for structured output translation (Claude tool_use path).
 * No JSON format instructions needed — the tool schema enforces the response shape.
 */
function buildStructuredSystemPrompt(targetLang, templateContent, glossaryConfig) {
    let prompt;
    if (templateContent) {
        prompt = templateContent.replace(/\{\{targetLanguage\}\}/g, targetLang);
    }
    else {
        prompt = `You are a professional translator. Translate each text segment to ${targetLang}.

Rules:
- Translate only the text content; do not alter structure or punctuation outside the text
- Preserve proper nouns, code identifiers, URLs, and file paths unchanged
- Produce natural, fluent text in the target language
- For Japanese: use full-width punctuation (。、?!), add half-width spaces around English words/numbers
- Keep product names, abbreviations, and technical terms unchanged (e.g., NGSI-LD, MCP, GeoJSON)
- Each segment is independent; translate it on its own`;
    }
    // Append the glossary section only when the glossary produces content.
    const glossarySection = glossaryConfig
        ? buildGlossaryPrompt(glossaryConfig, targetLang)
        : undefined;
    if (glossarySection) {
        prompt += glossarySection;
    }
    return prompt;
}
|
|
283
|
+
/**
 * Parse translation JSON response from LLM (text mode fallback).
 * Handles both clean JSON and JSON embedded in prose (extracts first {...} block).
 *
 * @param {string} raw - Raw LLM response text.
 * @returns {unknown} The parsed JSON value (validated by the caller).
 * @throws {Error} "Failed to parse translation response: ..." when no parseable
 *   JSON object can be recovered from `raw`.
 */
function parseTranslationResponse(raw) {
    const trimmed = raw.trim();
    try {
        return JSON.parse(trimmed);
    }
    catch {
        // Extract first JSON object if LLM added prose around it
        const jsonMatch = trimmed.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
            try {
                return JSON.parse(jsonMatch[0]);
            }
            catch {
                // Fall through: the extracted {...} block was not valid JSON either,
                // so raise the descriptive error below instead of a raw SyntaxError.
            }
        }
        throw new Error(`Failed to parse translation response: ${trimmed.slice(0, 200)}`);
    }
}
|
|
301
|
+
/**
 * Assert that `value` has the shape Array<{id: number, text: string}>.
 * Returns nothing on success; throws a descriptive error otherwise.
 */
function assertValidTranslations(value) {
    const isValidEntry = (entry) => typeof entry === "object" &&
        entry !== null &&
        typeof entry.id === "number" &&
        typeof entry.text === "string";
    if (!Array.isArray(value) || !value.every(isValidEntry)) {
        throw new Error("[yuuhitsu] translateBatch: invalid translation payload — expected Array<{id: number, text: string}>");
    }
}
|
|
312
|
+
/**
 * Translate a batch of text segments using the provider.
 *
 * - If the provider implements `translateStructured` (Claude): uses tool_use to
 *   enforce the JSON schema at the API level, guaranteeing 1:1 ID mapping.
 * - Otherwise: falls back to text-mode JSON prompt (Gemini / Ollama).
 *
 * In both paths the 1:1 ID mapping is validated and an error is thrown on mismatch.
 *
 * @param provider - LLM provider; may optionally implement `translateStructured`.
 * @param nodes - Array of `{ node, id }`; each `node.value` is mutated in place
 *   with its translation (only when the translation is non-blank).
 * @param targetLang - Target language passed to the system prompt builders.
 * @param templateContent - Optional prompt template ({{targetLanguage}} placeholder).
 * @param glossaryConfig - Optional glossary configuration.
 * @returns `{ usage }` token accounting from the provider call.
 * @throws On invalid payload shape, duplicate/unexpected/missing IDs.
 */
async function translateBatch(provider, nodes, targetLang, templateContent, glossaryConfig) {
    if (nodes.length === 0) {
        return { usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 } };
    }
    const segments = nodes.map(({ node, id }) => ({ id, text: node.value }));
    // P-A1: track total input character count for truncation detection
    const totalInputChars = segments.reduce((sum, s) => sum + s.text.length, 0);
    // P-A1: warn when output text is suspiciously short relative to input.
    // Shared by both provider paths (previously duplicated in each branch).
    const warnIfTruncated = (items) => {
        const totalOutputChars = items.reduce((sum, t) => sum + t.text.length, 0);
        if (totalInputChars > 0 && totalOutputChars < totalInputChars * MIN_OUTPUT_RATIO) {
            console.warn(`[yuuhitsu] translateBatch: output may be truncated ` +
                `(input chars: ${totalInputChars}, output chars: ${totalOutputChars})`);
        }
    };
    let translations;
    let usage;
    if (provider.translateStructured) {
        // Structured output path: provider (Claude) enforces JSON schema via tool_use
        const systemPrompt = buildStructuredSystemPrompt(targetLang, templateContent, glossaryConfig);
        const result = await provider.translateStructured({ segments, systemPrompt });
        assertValidTranslations(result.translations);
        translations = result.translations;
        usage = result.usage;
    }
    else {
        // Text mode fallback: Gemini / Ollama
        const systemPrompt = buildBatchSystemPrompt(targetLang, templateContent, glossaryConfig);
        const messages = [
            { role: "system", content: systemPrompt },
            { role: "user", content: JSON.stringify({ segments }) },
        ];
        const response = await provider.chat({ model: "", messages });
        const parsed = parseTranslationResponse(response.content);
        assertValidTranslations(parsed.translations);
        translations = parsed.translations;
        usage = response.usage;
    }
    warnIfTruncated(translations);
    // Validate strict 1:1 ID mapping (both paths):
    // - no duplicate output IDs (Map silently overwrites; we must detect before)
    // - no unexpected IDs (IDs not present in the input set)
    // - no missing IDs (input ID absent from output)
    const inputIds = new Set(nodes.map(({ id }) => id));
    const seenOutputIds = new Set();
    const duplicateIds = [];
    const unexpectedIds = [];
    for (const t of translations) {
        if (seenOutputIds.has(t.id)) {
            duplicateIds.push(t.id);
        }
        else {
            seenOutputIds.add(t.id);
        }
        if (!inputIds.has(t.id)) {
            unexpectedIds.push(t.id);
        }
    }
    if (duplicateIds.length > 0) {
        throw new Error(`[yuuhitsu] translateBatch: duplicate IDs in response (IDs: ${duplicateIds.join(", ")})`);
    }
    if (unexpectedIds.length > 0) {
        throw new Error(`[yuuhitsu] translateBatch: unexpected IDs in response (IDs: ${unexpectedIds.join(", ")})`);
    }
    const translationMap = new Map(translations.map((t) => [t.id, t.text]));
    const missingIds = nodes.filter(({ id }) => !translationMap.has(id)).map(({ id }) => id);
    if (missingIds.length > 0) {
        throw new Error(`[yuuhitsu] translateBatch: partial translation — ${missingIds.length} node(s) missing` +
            ` from response (IDs: ${missingIds.join(", ")})`);
    }
    // Apply translations back to AST nodes (blank translations leave the node unchanged)
    for (const { node, id } of nodes) {
        const translated = translationMap.get(id);
        if (translated !== undefined && translated.trim()) {
            node.value = translated;
        }
    }
    return { usage };
}
|
|
403
|
+
/**
 * Resolve the output path for a translated file.
 * An explicit `outputPath` wins; otherwise the target language code is inserted
 * before the extension: `dir/name.md` → `dir/name.<lang>.md`.
 */
function resolveOutputPath(inputPath, targetLang, outputPath) {
    if (outputPath)
        return outputPath;
    const extension = extname(inputPath);
    const stem = basename(inputPath, extension);
    return join(dirname(inputPath), `${stem}.${targetLang}${extension}`);
}
|
|
411
|
+
/**
|
|
412
|
+
* Translate a Markdown file using AST-based approach.
|
|
413
|
+
*
|
|
414
|
+
* Architecture (0.2.0):
|
|
415
|
+
* 1. Separate frontmatter (preserved verbatim)
|
|
416
|
+
* 2. Split body into chunks by heading boundaries
|
|
417
|
+
* 3. For each chunk: parse to AST → extract text nodes → translate via LLM → write back
|
|
418
|
+
* 4. Serialize AST → markdown → concatenate → write output
|
|
419
|
+
*
|
|
420
|
+
* Key properties:
|
|
421
|
+
* - Code blocks (fenced and inline) are never sent to LLM (AST handles them deterministically)
|
|
422
|
+
* - Markdown structure (headings, lists, tables, HR) is preserved by AST round-trip
|
|
423
|
+
* - No sentinel injection or removal needed
|
|
424
|
+
*/
|
|
503
425
|
export async function translateFile(options) {
|
|
504
|
-
const { provider, inputPath, targetLang, templateContent, glossaryConfig, maxChunkLines } = options;
|
|
505
|
-
|
|
426
|
+
const { provider, inputPath, targetLang, templateContent, glossaryConfig, maxChunkLines, maxNodesPerBatch, } = options;
|
|
427
|
+
if (maxNodesPerBatch !== undefined && (!Number.isInteger(maxNodesPerBatch) || maxNodesPerBatch < 1)) {
|
|
428
|
+
throw new Error(`maxNodesPerBatch must be a positive integer, got: ${maxNodesPerBatch}`);
|
|
429
|
+
}
|
|
430
|
+
const resolvedMaxNodes = maxNodesPerBatch ?? DEFAULT_MAX_NODES_PER_BATCH;
|
|
506
431
|
let content;
|
|
507
432
|
try {
|
|
508
433
|
content = readFileSync(inputPath, "utf-8");
|
|
@@ -513,46 +438,46 @@ export async function translateFile(options) {
|
|
|
513
438
|
}
|
|
514
439
|
throw err;
|
|
515
440
|
}
|
|
516
|
-
// Check for empty file
|
|
517
441
|
if (content.trim().length === 0) {
|
|
518
442
|
throw new Error(`Input file is empty: ${inputPath}`);
|
|
519
443
|
}
|
|
520
444
|
const resolvedOutput = resolveOutputPath(inputPath, targetLang, options.outputPath);
|
|
521
|
-
// Ensure output directory exists
|
|
522
445
|
mkdirSync(dirname(resolvedOutput), { recursive: true });
|
|
523
|
-
// Separate frontmatter from body
|
|
524
446
|
const { frontmatter, body } = separateFrontmatter(content);
|
|
525
|
-
//
|
|
526
|
-
const
|
|
527
|
-
const
|
|
528
|
-
// Protect block boundaries: insert %%BB%% sentinels before structural elements (P-A4)
|
|
529
|
-
const bodyWithSentinels = protectBlockBoundaries(protectedBody);
|
|
530
|
-
const hasSentinels = bodyWithSentinels.includes(BLOCK_BOUNDARY_SENTINEL);
|
|
531
|
-
// Split body into chunks if needed (frontmatter is never sent to LLM)
|
|
532
|
-
const chunks = splitIntoChunks(bodyWithSentinels, maxChunkLines);
|
|
533
|
-
const translatedParts = [];
|
|
447
|
+
// Split body into text chunks (heading-based, same as before)
|
|
448
|
+
const chunks = splitIntoChunks(body, maxChunkLines ?? DEFAULT_MAX_CHUNK_LINES);
|
|
449
|
+
const translatedChunks = [];
|
|
534
450
|
let totalUsage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
|
|
535
451
|
for (const chunk of chunks) {
|
|
536
|
-
|
|
537
|
-
const
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
452
|
+
// Parse chunk to AST (code blocks, inline code become typed nodes → never reach LLM)
|
|
453
|
+
const processor = remark().use(remarkGfm);
|
|
454
|
+
const ast = processor.parse(chunk);
|
|
455
|
+
// Extract translatable text nodes
|
|
456
|
+
const textNodes = extractTextNodes(ast);
|
|
457
|
+
if (textNodes.length > 0) {
|
|
458
|
+
// Split into sub-batches when node count exceeds maxNodesPerBatch (BUG-421-dense-chunk fix).
|
|
459
|
+
// Dense files (e.g. changelog) may have 300+ nodes/chunk, causing Claude ID hallucination.
|
|
460
|
+
for (let batchStart = 0; batchStart < textNodes.length; batchStart += resolvedMaxNodes) {
|
|
461
|
+
const batch = textNodes.slice(batchStart, batchStart + resolvedMaxNodes);
|
|
462
|
+
const { usage } = await translateBatch(provider, batch, targetLang, templateContent, glossaryConfig);
|
|
463
|
+
totalUsage.promptTokens += usage.promptTokens;
|
|
464
|
+
totalUsage.completionTokens += usage.completionTokens;
|
|
465
|
+
totalUsage.totalTokens += usage.totalTokens;
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
// Serialize AST back to markdown
|
|
469
|
+
const translatedChunk = processor.stringify(ast);
|
|
470
|
+
translatedChunks.push(translatedChunk);
|
|
545
471
|
}
|
|
546
|
-
//
|
|
547
|
-
//
|
|
548
|
-
|
|
549
|
-
const
|
|
550
|
-
|
|
551
|
-
|
|
472
|
+
// Join chunks: normalize each chunk to exactly one trailing newline
|
|
473
|
+
// (preserves trailing spaces for Markdown hard line breaks), then join
|
|
474
|
+
// with a single "\n" so chunk boundaries produce exactly one blank line.
|
|
475
|
+
const translatedBody = translatedChunks
|
|
476
|
+
.map((c) => c.replace(/\n+$/, "\n"))
|
|
477
|
+
.join("\n");
|
|
552
478
|
const translatedContent = frontmatter
|
|
553
479
|
? frontmatter + translatedBody
|
|
554
480
|
: translatedBody;
|
|
555
|
-
// Write output
|
|
556
481
|
writeFileSync(resolvedOutput, translatedContent, "utf-8");
|
|
557
482
|
return {
|
|
558
483
|
outputPath: resolvedOutput,
|