@llm-translate/cli 1.0.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.dockerignore +51 -0
- package/.env.example +33 -0
- package/.github/workflows/docs-pages.yml +57 -0
- package/.github/workflows/release.yml +49 -0
- package/.translaterc.json +44 -0
- package/CLAUDE.md +243 -0
- package/Dockerfile +55 -0
- package/README.md +371 -0
- package/RFC.md +1595 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +4494 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/index.d.ts +1152 -0
- package/dist/index.js +3841 -0
- package/dist/index.js.map +1 -0
- package/docker-compose.yml +56 -0
- package/docs/.vitepress/config.ts +161 -0
- package/docs/api/agent.md +262 -0
- package/docs/api/engine.md +274 -0
- package/docs/api/index.md +171 -0
- package/docs/api/providers.md +304 -0
- package/docs/changelog.md +64 -0
- package/docs/cli/dir.md +243 -0
- package/docs/cli/file.md +213 -0
- package/docs/cli/glossary.md +273 -0
- package/docs/cli/index.md +129 -0
- package/docs/cli/init.md +158 -0
- package/docs/cli/serve.md +211 -0
- package/docs/glossary.json +235 -0
- package/docs/guide/chunking.md +272 -0
- package/docs/guide/configuration.md +139 -0
- package/docs/guide/cost-optimization.md +237 -0
- package/docs/guide/docker.md +371 -0
- package/docs/guide/getting-started.md +150 -0
- package/docs/guide/glossary.md +241 -0
- package/docs/guide/index.md +86 -0
- package/docs/guide/ollama.md +515 -0
- package/docs/guide/prompt-caching.md +221 -0
- package/docs/guide/providers.md +232 -0
- package/docs/guide/quality-control.md +206 -0
- package/docs/guide/vitepress-integration.md +265 -0
- package/docs/index.md +63 -0
- package/docs/ja/api/agent.md +262 -0
- package/docs/ja/api/engine.md +274 -0
- package/docs/ja/api/index.md +171 -0
- package/docs/ja/api/providers.md +304 -0
- package/docs/ja/changelog.md +64 -0
- package/docs/ja/cli/dir.md +243 -0
- package/docs/ja/cli/file.md +213 -0
- package/docs/ja/cli/glossary.md +273 -0
- package/docs/ja/cli/index.md +111 -0
- package/docs/ja/cli/init.md +158 -0
- package/docs/ja/guide/chunking.md +271 -0
- package/docs/ja/guide/configuration.md +139 -0
- package/docs/ja/guide/cost-optimization.md +30 -0
- package/docs/ja/guide/getting-started.md +150 -0
- package/docs/ja/guide/glossary.md +214 -0
- package/docs/ja/guide/index.md +32 -0
- package/docs/ja/guide/ollama.md +410 -0
- package/docs/ja/guide/prompt-caching.md +221 -0
- package/docs/ja/guide/providers.md +232 -0
- package/docs/ja/guide/quality-control.md +137 -0
- package/docs/ja/guide/vitepress-integration.md +265 -0
- package/docs/ja/index.md +58 -0
- package/docs/ko/api/agent.md +262 -0
- package/docs/ko/api/engine.md +274 -0
- package/docs/ko/api/index.md +171 -0
- package/docs/ko/api/providers.md +304 -0
- package/docs/ko/changelog.md +64 -0
- package/docs/ko/cli/dir.md +243 -0
- package/docs/ko/cli/file.md +213 -0
- package/docs/ko/cli/glossary.md +273 -0
- package/docs/ko/cli/index.md +111 -0
- package/docs/ko/cli/init.md +158 -0
- package/docs/ko/guide/chunking.md +271 -0
- package/docs/ko/guide/configuration.md +139 -0
- package/docs/ko/guide/cost-optimization.md +30 -0
- package/docs/ko/guide/getting-started.md +150 -0
- package/docs/ko/guide/glossary.md +214 -0
- package/docs/ko/guide/index.md +32 -0
- package/docs/ko/guide/ollama.md +410 -0
- package/docs/ko/guide/prompt-caching.md +221 -0
- package/docs/ko/guide/providers.md +232 -0
- package/docs/ko/guide/quality-control.md +137 -0
- package/docs/ko/guide/vitepress-integration.md +265 -0
- package/docs/ko/index.md +58 -0
- package/docs/zh/api/agent.md +262 -0
- package/docs/zh/api/engine.md +274 -0
- package/docs/zh/api/index.md +171 -0
- package/docs/zh/api/providers.md +304 -0
- package/docs/zh/changelog.md +64 -0
- package/docs/zh/cli/dir.md +243 -0
- package/docs/zh/cli/file.md +213 -0
- package/docs/zh/cli/glossary.md +273 -0
- package/docs/zh/cli/index.md +111 -0
- package/docs/zh/cli/init.md +158 -0
- package/docs/zh/guide/chunking.md +271 -0
- package/docs/zh/guide/configuration.md +139 -0
- package/docs/zh/guide/cost-optimization.md +30 -0
- package/docs/zh/guide/getting-started.md +150 -0
- package/docs/zh/guide/glossary.md +214 -0
- package/docs/zh/guide/index.md +32 -0
- package/docs/zh/guide/ollama.md +410 -0
- package/docs/zh/guide/prompt-caching.md +221 -0
- package/docs/zh/guide/providers.md +232 -0
- package/docs/zh/guide/quality-control.md +137 -0
- package/docs/zh/guide/vitepress-integration.md +265 -0
- package/docs/zh/index.md +58 -0
- package/package.json +91 -0
- package/release.config.mjs +15 -0
- package/schemas/glossary.schema.json +110 -0
- package/src/cli/commands/dir.ts +469 -0
- package/src/cli/commands/file.ts +291 -0
- package/src/cli/commands/glossary.ts +221 -0
- package/src/cli/commands/init.ts +68 -0
- package/src/cli/commands/serve.ts +60 -0
- package/src/cli/index.ts +64 -0
- package/src/cli/options.ts +59 -0
- package/src/core/agent.ts +1119 -0
- package/src/core/chunker.ts +391 -0
- package/src/core/engine.ts +634 -0
- package/src/errors.ts +188 -0
- package/src/index.ts +147 -0
- package/src/integrations/vitepress.ts +549 -0
- package/src/parsers/markdown.ts +383 -0
- package/src/providers/claude.ts +259 -0
- package/src/providers/interface.ts +109 -0
- package/src/providers/ollama.ts +379 -0
- package/src/providers/openai.ts +308 -0
- package/src/providers/registry.ts +153 -0
- package/src/server/index.ts +152 -0
- package/src/server/middleware/auth.ts +93 -0
- package/src/server/middleware/logger.ts +90 -0
- package/src/server/routes/health.ts +84 -0
- package/src/server/routes/translate.ts +210 -0
- package/src/server/types.ts +138 -0
- package/src/services/cache.ts +899 -0
- package/src/services/config.ts +217 -0
- package/src/services/glossary.ts +247 -0
- package/src/types/analysis.ts +164 -0
- package/src/types/index.ts +265 -0
- package/src/types/modes.ts +121 -0
- package/src/types/mqm.ts +157 -0
- package/src/utils/logger.ts +141 -0
- package/src/utils/tokens.ts +116 -0
- package/tests/fixtures/glossaries/ml-glossary.json +53 -0
- package/tests/fixtures/input/lynq-installation.ko.md +350 -0
- package/tests/fixtures/input/lynq-installation.md +350 -0
- package/tests/fixtures/input/simple.ko.md +27 -0
- package/tests/fixtures/input/simple.md +27 -0
- package/tests/unit/chunker.test.ts +229 -0
- package/tests/unit/glossary.test.ts +146 -0
- package/tests/unit/markdown.test.ts +205 -0
- package/tests/unit/tokens.test.ts +81 -0
- package/tsconfig.json +28 -0
- package/tsup.config.ts +34 -0
- package/vitest.config.ts +16 -0
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import type { Chunk, ChunkingConfig } from "../types/index.js";
|
|
2
|
+
import { estimateTokens } from "../utils/tokens.js";
|
|
3
|
+
|
|
4
|
+
// ============================================================================
|
|
5
|
+
// Default Configuration
|
|
6
|
+
// ============================================================================
|
|
7
|
+
|
|
8
|
+
/**
 * Baseline chunking settings.
 *
 * `chunkContent` spreads this object into its working config and then
 * overrides `maxTokens` and `overlapTokens` with any caller-supplied values.
 */
const DEFAULT_CONFIG: ChunkingConfig = {
  // Estimated-token ceiling that translatable text is packed against.
  maxTokens: 1024,
  overlapTokens: 150,
  // Ordered coarsest to finest: blank line, newline, sentence end, space.
  separators: ["\n\n", "\n", ". ", " "],
  preservePatterns: [
    // Code blocks
    /```[\s\S]*?```/g,
    // Inline code
    /`[^`]+`/g,
    // Links
    /\[.*?\]\(.*?\)/g,
  ],
};
|
|
18
|
+
|
|
19
|
+
// ============================================================================
|
|
20
|
+
// Chunker Implementation
|
|
21
|
+
// ============================================================================
|
|
22
|
+
|
|
23
|
+
/**
 * Caller-tunable settings accepted by `chunkContent`. Every field is optional.
 */
export interface ChunkerOptions {
  /**
   * Token budget per chunk. When omitted, `chunkContent` uses
   * `DEFAULT_CONFIG.maxTokens`.
   */
  maxTokens?: number;

  /**
   * Overlap setting copied into the chunking config. When omitted,
   * `chunkContent` uses `DEFAULT_CONFIG.overlapTokens`.
   */
  overlapTokens?: number;

  /**
   * NOTE(review): declared here but not read by any function in this file;
   * confirm the intended behaviour before relying on it.
   */
  preserveCodeBlocks?: boolean;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Break a markdown document into an ordered list of chunks.
 *
 * Fenced code blocks and whitespace-only gaps become single `"preserve"`
 * chunks; all other text is handed to `splitIntoChunks`. Chunk ids are
 * assigned sequentially (`chunk-0`, `chunk-1`, ...) in document order.
 *
 * NOTE(review): `options.preserveCodeBlocks` is not read by this function.
 *
 * @param content - Full source document.
 * @param options - Optional overrides for `maxTokens` / `overlapTokens`.
 * @returns The chunks in document order, or `[]` when `content` is empty or
 *   whitespace-only.
 */
export function chunkContent(
  content: string,
  options: ChunkerOptions = {}
): Chunk[] {
  // Nothing to chunk in an empty or whitespace-only document.
  if (content.trim().length === 0) {
    return [];
  }

  const maxTokens = options.maxTokens ?? DEFAULT_CONFIG.maxTokens;
  const overlapTokens = options.overlapTokens ?? DEFAULT_CONFIG.overlapTokens;
  const config: ChunkingConfig = {
    ...DEFAULT_CONFIG,
    maxTokens,
    overlapTokens,
  };

  // Headers are collected once for the whole document and looked up by offset.
  const allHeaders = extractHeaderHierarchy(content);

  const result: Chunk[] = [];
  // Tail of the most recent translatable chunk, passed on as context.
  let carriedContext: string | undefined;

  for (const segment of extractPreservedSections(content).segments) {
    const headersAtSegment = getHeadersForPosition(
      allHeaders,
      segment.startOffset
    );

    if (segment.type === "preserve") {
      // Preserved content is emitted whole, never split.
      result.push({
        id: `chunk-${result.length}`,
        content: segment.content,
        type: "preserve",
        startOffset: segment.startOffset,
        endOffset: segment.endOffset,
        metadata: {
          headerHierarchy: headersAtSegment,
        },
      });
      continue;
    }

    const pieces = splitIntoChunks(
      segment.content,
      config,
      segment.startOffset
    );

    for (const piece of pieces) {
      if (!piece) continue;

      // Prefer the headers in effect at the piece itself; fall back to the
      // segment's headers when none apply at that offset.
      const headersAtPiece = getHeadersForPosition(
        allHeaders,
        piece.startOffset
      );

      result.push({
        ...piece,
        id: `chunk-${result.length}`,
        metadata: {
          headerHierarchy:
            headersAtPiece.length > 0 ? headersAtPiece : headersAtSegment,
          previousContext: carriedContext,
        },
      });

      // Keep at most the last 200 characters as context for the next chunk.
      carriedContext = truncateForContext(piece.content, 200);
    }
  }

  return result;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Collect every ATX-style markdown header (`#` through `######`) in `content`.
 *
 * Lines that fall inside a fenced code block are ignored, so a shell comment
 * such as `# install deps` inside a code fence is not mistaken for a heading.
 * Fences are detected with the same pattern that `extractPreservedSections`
 * uses, so both functions agree on what counts as a code block.
 *
 * @param content - Full markdown document.
 * @returns One entry per header in document order: `level` is the number of
 *   `#` marks, `text` is the whole header line (marks included) and `offset`
 *   is the index of the line start within `content`.
 */
function extractHeaderHierarchy(
  content: string
): Array<{ level: number; text: string; offset: number }> {
  // Character ranges covered by fenced code blocks, in ascending order.
  const fencedRanges: Array<{ start: number; end: number }> = [];
  const fenceRegex = /```[\s\S]*?```/g;
  for (
    let fence = fenceRegex.exec(content);
    fence !== null;
    fence = fenceRegex.exec(content)
  ) {
    fencedRanges.push({
      start: fence.index,
      end: fence.index + fence[0].length,
    });
  }

  const headers: Array<{ level: number; text: string; offset: number }> = [];
  // `[ \t]+` instead of `\s+`: the marker and its text must share a line,
  // otherwise a bare `#` line would absorb the following line as its text.
  const headerRegex = /^(#{1,6})[ \t]+(.+)$/gm;
  // Both lists are ordered by offset, so a single cursor is enough.
  let fenceCursor = 0;

  for (
    let match = headerRegex.exec(content);
    match !== null;
    match = headerRegex.exec(content)
  ) {
    const offset = match.index;

    // Skip fences that end before this candidate header.
    let fence = fencedRanges[fenceCursor];
    while (fence !== undefined && fence.end <= offset) {
      fenceCursor += 1;
      fence = fencedRanges[fenceCursor];
    }
    if (fence !== undefined && fence.start <= offset) {
      // Inside a code block: not a real header.
      continue;
    }

    const hashMarks = match[1];
    if (hashMarks) {
      headers.push({
        level: hashMarks.length,
        text: match[0],
        offset,
      });
    }
  }

  return headers;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Resolve the header trail in effect at `position`.
 *
 * Walks `headers` in order up to and including any header whose offset equals
 * `position`. Each header replaces whatever was recorded at its own level and
 * at every deeper level, so the result holds at most one header per level.
 *
 * @param headers - Headers ordered by ascending `offset`.
 * @param position - Character offset to resolve the trail for.
 * @returns Header texts ordered from level 1 to level 6.
 */
function getHeadersForPosition(
  headers: Array<{ level: number; text: string; offset: number }>,
  position: number
): string[] {
  const activeByLevel = new Map<number, string>();

  for (const { level, text, offset } of headers) {
    if (offset > position) {
      break;
    }

    // A new header closes every section at the same or a deeper level.
    for (const seenLevel of Array.from(activeByLevel.keys())) {
      if (seenLevel >= level) {
        activeByLevel.delete(seenLevel);
      }
    }
    activeByLevel.set(level, text);
  }

  // Emit shallowest to deepest, skipping levels with no (or empty) text.
  const trail: string[] = [];
  for (const level of [1, 2, 3, 4, 5, 6]) {
    const text = activeByLevel.get(level);
    if (text) {
      trail.push(text);
    }
  }
  return trail;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Keep the tail of `content` for use as context, preferring a word boundary.
 *
 * Content that already fits is returned unchanged. Otherwise the last
 * `maxChars` UTF-16 code units are kept and prefixed with `"..."`; if a space
 * occurs within the first 50 characters of that tail, the partial leading
 * word is dropped as well.
 *
 * @param content - Text to shorten.
 * @param maxChars - Maximum number of trailing code units to keep.
 * @returns `content` itself, or `"..."` followed by the kept tail.
 */
function truncateForContext(content: string, maxChars: number): string {
  if (content.length <= maxChars) return content;

  // `slice(-0)` is `slice(0)` and a negative budget would slice from the
  // front, so a non-positive budget keeps nothing but the marker.
  if (maxChars <= 0) return "...";

  let tail = content.slice(-maxChars);

  // The cut may land between the two halves of a surrogate pair; drop the
  // orphaned low surrogate rather than emit an invalid character.
  const firstUnit = tail.charCodeAt(0);
  if (firstUnit >= 0xdc00 && firstUnit <= 0xdfff) {
    tail = tail.slice(1);
  }

  const firstSpace = tail.indexOf(" ");
  if (firstSpace > 0 && firstSpace < 50) {
    return "..." + tail.slice(firstSpace + 1);
  }
  return "..." + tail;
}
|
|
183
|
+
|
|
184
|
+
// ============================================================================
|
|
185
|
+
// Preserved Section Extraction
|
|
186
|
+
// ============================================================================
|
|
187
|
+
|
|
188
|
+
/** A contiguous slice of the source document, tagged by how it is handled. */
interface Segment {
  content: string;
  type: "translatable" | "preserve";
  // Half-open range [startOffset, endOffset) within the source document.
  startOffset: number;
  endOffset: number;
  headerHierarchy?: string[];
}

/**
 * Partition `content` into consecutive segments that cover it end to end.
 *
 * Every fenced code block becomes a `"preserve"` segment. The text between
 * fences becomes `"translatable"`, except that a whitespace-only gap is also
 * tagged `"preserve"` so its exact line breaks are kept.
 *
 * @param content - Full source document.
 * @returns The segments in document order. Input with no fences yields a
 *   single segment, and empty input yields one empty translatable segment.
 */
function extractPreservedSections(content: string): { segments: Segment[] } {
  const segments: Segment[] = [];

  // Records the text between two offsets, if there is any.
  const pushGap = (from: number, to: number): void => {
    if (to <= from) return;
    const gapText = content.slice(from, to);
    segments.push({
      content: gapText,
      type: gapText.trim() ? "translatable" : "preserve",
      startOffset: from,
      endOffset: to,
    });
  };

  // `exec` with the `g` flag yields matches in ascending, non-overlapping
  // order, so segments can be emitted as the fences are found.
  const fencePattern = /```[\s\S]*?```/g;
  let cursor = 0;
  for (
    let fence = fencePattern.exec(content);
    fence !== null;
    fence = fencePattern.exec(content)
  ) {
    const fenceStart = fence.index;
    const fenceEnd = fenceStart + fence[0].length;

    pushGap(cursor, fenceStart);
    segments.push({
      content: fence[0],
      type: "preserve",
      startOffset: fenceStart,
      endOffset: fenceEnd,
    });
    cursor = fenceEnd;
  }

  // Whatever follows the last fence (or the whole text if there was none).
  pushGap(cursor, content.length);

  // Only reachable for empty input: still hand back one segment.
  if (segments.length === 0) {
    segments.push({
      content,
      type: "translatable",
      startOffset: 0,
      endOffset: content.length,
    });
  }

  return { segments };
}
|
|
274
|
+
|
|
275
|
+
// ============================================================================
|
|
276
|
+
// Text Chunking with Overlap
|
|
277
|
+
// ============================================================================
|
|
278
|
+
|
|
279
|
+
/**
 * Cut `text` after every occurrence of `separator`, leaving the separator at
 * the end of the piece it terminates so the pieces concatenate back to
 * `text` exactly. `separator` must be non-empty.
 */
function splitKeepingSeparator(text: string, separator: string): string[] {
  const pieces: string[] = [];
  let from = 0;
  while (from < text.length) {
    const at = text.indexOf(separator, from);
    if (at === -1) {
      pieces.push(text.slice(from));
      break;
    }
    const end = at + separator.length;
    pieces.push(text.slice(from, end));
    from = end;
  }
  return pieces;
}

/**
 * Break a part that exceeds `maxTokens` into smaller pieces, trying each
 * separator in order (coarsest first). A part that fits is returned as-is; a
 * part that contains none of the separators is returned whole even though it
 * is over the limit, because there is no safe place left to cut it.
 */
function splitOversizedPart(
  part: string,
  maxTokens: number,
  separators: readonly string[]
): string[] {
  if (estimateTokens(part) <= maxTokens) return [part];

  for (let i = 0; i < separators.length; i++) {
    const separator = separators[i];
    if (!separator) continue; // an empty separator can never make progress

    const pieces = splitKeepingSeparator(part, separator);
    if (pieces.length <= 1) continue;

    // Pieces that are still too large may only use the finer separators.
    const finer = separators.slice(i + 1);
    const result: string[] = [];
    for (const piece of pieces) {
      for (const sub of splitOversizedPart(piece, maxTokens, finer)) {
        result.push(sub);
      }
    }
    return result;
  }

  return [part];
}

/**
 * Split translatable text into chunks whose estimated token count stays
 * within `config.maxTokens`, without altering any character.
 *
 * Text is first divided at blank-line runs (the separators are kept) and the
 * parts are packed greedily into chunks. A part that is too large on its own
 * is divided further using `config.separators`, so a single long paragraph no
 * longer produces one over-limit chunk. When every part fits, the result is
 * the same as with paragraph-only packing.
 *
 * @param text - Translatable text to split.
 * @param config - Chunking settings; `maxTokens` and `separators` are read.
 * @param baseOffset - Offset of `text` within the source document; chunk
 *   offsets are reported relative to the document.
 * @returns Contiguous, non-overlapping chunks (with empty `id`) whose
 *   contents concatenate back to `text`.
 */
function splitIntoChunks(
  text: string,
  config: ChunkingConfig,
  baseOffset: number
): Chunk[] {
  const toChunk = (content: string, startOffset: number): Chunk => ({
    id: "",
    content,
    type: "translatable",
    startOffset,
    endOffset: startOffset + content.length,
  });

  // If text fits in one chunk, return it as-is (preserve whitespace).
  if (estimateTokens(text) <= config.maxTokens) {
    return [toChunk(text, baseOffset)];
  }

  // Paragraph split with a capturing group keeps the exact blank-line runs;
  // only parts that are oversized on their own are cut any finer.
  const separators = config.separators ?? [];
  const pieces: string[] = [];
  for (const part of text.split(/(\n\n+)/)) {
    for (const piece of splitOversizedPart(part, config.maxTokens, separators)) {
      pieces.push(piece);
    }
  }

  const chunks: Chunk[] = [];
  let currentChunk = "";
  let chunkStartOffset = baseOffset;
  let textOffset = baseOffset;

  for (const piece of pieces) {
    const candidate = currentChunk + piece;

    if (currentChunk && estimateTokens(candidate) > config.maxTokens) {
      // Adding this piece would overflow: close the chunk and start a new one.
      chunks.push(toChunk(currentChunk, chunkStartOffset));
      currentChunk = piece;
      chunkStartOffset = textOffset;
    } else {
      currentChunk = candidate;
    }

    textOffset += piece.length;
  }

  // Add remaining content (preserve as-is).
  if (currentChunk.length > 0) {
    chunks.push(toChunk(currentChunk, chunkStartOffset));
  }

  return chunks;
}
|
|
348
|
+
|
|
349
|
+
// ============================================================================
|
|
350
|
+
// Utility Functions
|
|
351
|
+
// ============================================================================
|
|
352
|
+
|
|
353
|
+
/**
 * Rebuild a document from its chunks.
 *
 * Chunks are ordered by `startOffset` and their contents are concatenated
 * with nothing in between. This assumes the chunk contents do not overlap;
 * no de-duplication is performed. The input array is not modified.
 *
 * @param chunks - Chunks in any order.
 * @returns The concatenated chunk contents.
 */
export function reassembleChunks(chunks: Chunk[]): string {
  // Sort a copy so the caller's array keeps its order.
  const ordered = Array.from(chunks);
  ordered.sort((left, right) => left.startOffset - right.startOffset);

  const contents = ordered.map(({ content }) => content);
  return contents.join("");
}
|
|
364
|
+
|
|
365
|
+
/**
 * Summarise a list of chunks.
 *
 * @param chunks - Chunks to measure.
 * @returns The chunk count, the counts per chunk type, the sum of
 *   `estimateTokens` over every chunk's content, and that sum divided by the
 *   chunk count, rounded to the nearest integer (0 for an empty list).
 */
export function getChunkStats(chunks: Chunk[]): {
  totalChunks: number;
  translatableChunks: number;
  preservedChunks: number;
  totalTokens: number;
  averageTokens: number;
} {
  let translatableCount = 0;
  let preservedCount = 0;
  let tokenSum = 0;

  // One pass: tally the chunk types and accumulate the token estimate.
  chunks.forEach((chunk) => {
    if (chunk.type === "translatable") {
      translatableCount += 1;
    } else if (chunk.type === "preserve") {
      preservedCount += 1;
    }
    tokenSum += estimateTokens(chunk.content);
  });

  const chunkCount = chunks.length;
  // Avoid dividing by zero for an empty list.
  const averageTokens =
    chunkCount > 0 ? Math.round(tokenSum / chunkCount) : 0;

  return {
    totalChunks: chunkCount,
    translatableChunks: translatableCount,
    preservedChunks: preservedCount,
    totalTokens: tokenSum,
    averageTokens,
  };
}
|