@llm-translate/cli 1.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. package/.dockerignore +51 -0
  2. package/.env.example +33 -0
  3. package/.github/workflows/docs-pages.yml +57 -0
  4. package/.github/workflows/release.yml +49 -0
  5. package/.translaterc.json +44 -0
  6. package/CLAUDE.md +243 -0
  7. package/Dockerfile +55 -0
  8. package/README.md +371 -0
  9. package/RFC.md +1595 -0
  10. package/dist/cli/index.d.ts +2 -0
  11. package/dist/cli/index.js +4494 -0
  12. package/dist/cli/index.js.map +1 -0
  13. package/dist/index.d.ts +1152 -0
  14. package/dist/index.js +3841 -0
  15. package/dist/index.js.map +1 -0
  16. package/docker-compose.yml +56 -0
  17. package/docs/.vitepress/config.ts +161 -0
  18. package/docs/api/agent.md +262 -0
  19. package/docs/api/engine.md +274 -0
  20. package/docs/api/index.md +171 -0
  21. package/docs/api/providers.md +304 -0
  22. package/docs/changelog.md +64 -0
  23. package/docs/cli/dir.md +243 -0
  24. package/docs/cli/file.md +213 -0
  25. package/docs/cli/glossary.md +273 -0
  26. package/docs/cli/index.md +129 -0
  27. package/docs/cli/init.md +158 -0
  28. package/docs/cli/serve.md +211 -0
  29. package/docs/glossary.json +235 -0
  30. package/docs/guide/chunking.md +272 -0
  31. package/docs/guide/configuration.md +139 -0
  32. package/docs/guide/cost-optimization.md +237 -0
  33. package/docs/guide/docker.md +371 -0
  34. package/docs/guide/getting-started.md +150 -0
  35. package/docs/guide/glossary.md +241 -0
  36. package/docs/guide/index.md +86 -0
  37. package/docs/guide/ollama.md +515 -0
  38. package/docs/guide/prompt-caching.md +221 -0
  39. package/docs/guide/providers.md +232 -0
  40. package/docs/guide/quality-control.md +206 -0
  41. package/docs/guide/vitepress-integration.md +265 -0
  42. package/docs/index.md +63 -0
  43. package/docs/ja/api/agent.md +262 -0
  44. package/docs/ja/api/engine.md +274 -0
  45. package/docs/ja/api/index.md +171 -0
  46. package/docs/ja/api/providers.md +304 -0
  47. package/docs/ja/changelog.md +64 -0
  48. package/docs/ja/cli/dir.md +243 -0
  49. package/docs/ja/cli/file.md +213 -0
  50. package/docs/ja/cli/glossary.md +273 -0
  51. package/docs/ja/cli/index.md +111 -0
  52. package/docs/ja/cli/init.md +158 -0
  53. package/docs/ja/guide/chunking.md +271 -0
  54. package/docs/ja/guide/configuration.md +139 -0
  55. package/docs/ja/guide/cost-optimization.md +30 -0
  56. package/docs/ja/guide/getting-started.md +150 -0
  57. package/docs/ja/guide/glossary.md +214 -0
  58. package/docs/ja/guide/index.md +32 -0
  59. package/docs/ja/guide/ollama.md +410 -0
  60. package/docs/ja/guide/prompt-caching.md +221 -0
  61. package/docs/ja/guide/providers.md +232 -0
  62. package/docs/ja/guide/quality-control.md +137 -0
  63. package/docs/ja/guide/vitepress-integration.md +265 -0
  64. package/docs/ja/index.md +58 -0
  65. package/docs/ko/api/agent.md +262 -0
  66. package/docs/ko/api/engine.md +274 -0
  67. package/docs/ko/api/index.md +171 -0
  68. package/docs/ko/api/providers.md +304 -0
  69. package/docs/ko/changelog.md +64 -0
  70. package/docs/ko/cli/dir.md +243 -0
  71. package/docs/ko/cli/file.md +213 -0
  72. package/docs/ko/cli/glossary.md +273 -0
  73. package/docs/ko/cli/index.md +111 -0
  74. package/docs/ko/cli/init.md +158 -0
  75. package/docs/ko/guide/chunking.md +271 -0
  76. package/docs/ko/guide/configuration.md +139 -0
  77. package/docs/ko/guide/cost-optimization.md +30 -0
  78. package/docs/ko/guide/getting-started.md +150 -0
  79. package/docs/ko/guide/glossary.md +214 -0
  80. package/docs/ko/guide/index.md +32 -0
  81. package/docs/ko/guide/ollama.md +410 -0
  82. package/docs/ko/guide/prompt-caching.md +221 -0
  83. package/docs/ko/guide/providers.md +232 -0
  84. package/docs/ko/guide/quality-control.md +137 -0
  85. package/docs/ko/guide/vitepress-integration.md +265 -0
  86. package/docs/ko/index.md +58 -0
  87. package/docs/zh/api/agent.md +262 -0
  88. package/docs/zh/api/engine.md +274 -0
  89. package/docs/zh/api/index.md +171 -0
  90. package/docs/zh/api/providers.md +304 -0
  91. package/docs/zh/changelog.md +64 -0
  92. package/docs/zh/cli/dir.md +243 -0
  93. package/docs/zh/cli/file.md +213 -0
  94. package/docs/zh/cli/glossary.md +273 -0
  95. package/docs/zh/cli/index.md +111 -0
  96. package/docs/zh/cli/init.md +158 -0
  97. package/docs/zh/guide/chunking.md +271 -0
  98. package/docs/zh/guide/configuration.md +139 -0
  99. package/docs/zh/guide/cost-optimization.md +30 -0
  100. package/docs/zh/guide/getting-started.md +150 -0
  101. package/docs/zh/guide/glossary.md +214 -0
  102. package/docs/zh/guide/index.md +32 -0
  103. package/docs/zh/guide/ollama.md +410 -0
  104. package/docs/zh/guide/prompt-caching.md +221 -0
  105. package/docs/zh/guide/providers.md +232 -0
  106. package/docs/zh/guide/quality-control.md +137 -0
  107. package/docs/zh/guide/vitepress-integration.md +265 -0
  108. package/docs/zh/index.md +58 -0
  109. package/package.json +91 -0
  110. package/release.config.mjs +15 -0
  111. package/schemas/glossary.schema.json +110 -0
  112. package/src/cli/commands/dir.ts +469 -0
  113. package/src/cli/commands/file.ts +291 -0
  114. package/src/cli/commands/glossary.ts +221 -0
  115. package/src/cli/commands/init.ts +68 -0
  116. package/src/cli/commands/serve.ts +60 -0
  117. package/src/cli/index.ts +64 -0
  118. package/src/cli/options.ts +59 -0
  119. package/src/core/agent.ts +1119 -0
  120. package/src/core/chunker.ts +391 -0
  121. package/src/core/engine.ts +634 -0
  122. package/src/errors.ts +188 -0
  123. package/src/index.ts +147 -0
  124. package/src/integrations/vitepress.ts +549 -0
  125. package/src/parsers/markdown.ts +383 -0
  126. package/src/providers/claude.ts +259 -0
  127. package/src/providers/interface.ts +109 -0
  128. package/src/providers/ollama.ts +379 -0
  129. package/src/providers/openai.ts +308 -0
  130. package/src/providers/registry.ts +153 -0
  131. package/src/server/index.ts +152 -0
  132. package/src/server/middleware/auth.ts +93 -0
  133. package/src/server/middleware/logger.ts +90 -0
  134. package/src/server/routes/health.ts +84 -0
  135. package/src/server/routes/translate.ts +210 -0
  136. package/src/server/types.ts +138 -0
  137. package/src/services/cache.ts +899 -0
  138. package/src/services/config.ts +217 -0
  139. package/src/services/glossary.ts +247 -0
  140. package/src/types/analysis.ts +164 -0
  141. package/src/types/index.ts +265 -0
  142. package/src/types/modes.ts +121 -0
  143. package/src/types/mqm.ts +157 -0
  144. package/src/utils/logger.ts +141 -0
  145. package/src/utils/tokens.ts +116 -0
  146. package/tests/fixtures/glossaries/ml-glossary.json +53 -0
  147. package/tests/fixtures/input/lynq-installation.ko.md +350 -0
  148. package/tests/fixtures/input/lynq-installation.md +350 -0
  149. package/tests/fixtures/input/simple.ko.md +27 -0
  150. package/tests/fixtures/input/simple.md +27 -0
  151. package/tests/unit/chunker.test.ts +229 -0
  152. package/tests/unit/glossary.test.ts +146 -0
  153. package/tests/unit/markdown.test.ts +205 -0
  154. package/tests/unit/tokens.test.ts +81 -0
  155. package/tsconfig.json +28 -0
  156. package/tsup.config.ts +34 -0
  157. package/vitest.config.ts +16 -0
@@ -0,0 +1,391 @@
1
+ import type { Chunk, ChunkingConfig } from "../types/index.js";
2
+ import { estimateTokens } from "../utils/tokens.js";
3
+
4
+ // ============================================================================
5
+ // Default Configuration
6
+ // ============================================================================
7
+
8
/**
 * Baseline chunking settings. `chunkContent` spreads this object and then
 * overrides `maxTokens` / `overlapTokens` with any caller-supplied values.
 */
const DEFAULT_CONFIG: ChunkingConfig = {
  // Token budget for a single translatable chunk.
  maxTokens: 1024,
  // Overlap budget carried in the config.
  overlapTokens: 150,
  // Candidate split points, listed from coarsest to finest.
  separators: ["\n\n", "\n", ". ", " "],
  // Markdown constructs whose text should be kept verbatim.
  preservePatterns: [
    /```[\s\S]*?```/g, // Code blocks
    /`[^`]+`/g, // Inline code
    /\[.*?\]\(.*?\)/g, // Links
  ],
};
18
+
19
+ // ============================================================================
20
+ // Chunker Implementation
21
+ // ============================================================================
22
+
23
/**
 * Caller-facing options for {@link chunkContent}.
 */
export interface ChunkerOptions {
  /** Token budget per translatable chunk. Defaults to `DEFAULT_CONFIG.maxTokens`. */
  maxTokens?: number;
  /** Overlap budget copied into the chunking config. Defaults to `DEFAULT_CONFIG.overlapTokens`. */
  overlapTokens?: number;
  /**
   * Whether fenced code blocks are emitted verbatim as "preserve" chunks.
   * Defaults to `true`; pass `false` to treat the whole document as
   * translatable text.
   */
  preserveCodeBlocks?: boolean;
}

/**
 * Split content into chunks that respect token limits.
 *
 * The document is first divided into segments: fenced code blocks (and
 * whitespace-only gaps) become single "preserve" chunks, everything else is
 * split further by `splitIntoChunks`. Every chunk is tagged with the markdown
 * headers in effect at its start offset, and each translatable chunk also
 * carries a truncated copy of the previous translatable chunk as context.
 *
 * @param content - Markdown source to split.
 * @param options - Optional overrides; see {@link ChunkerOptions}.
 * @returns Chunks in document order with ids `chunk-0`, `chunk-1`, ...;
 *          an empty array when `content` is empty or whitespace-only.
 */
export function chunkContent(
  content: string,
  options: ChunkerOptions = {}
): Chunk[] {
  // Nothing to translate in empty or whitespace-only input.
  if (!content.trim()) {
    return [];
  }

  const config: ChunkingConfig = {
    ...DEFAULT_CONFIG,
    maxTokens: options.maxTokens ?? DEFAULT_CONFIG.maxTokens,
    overlapTokens: options.overlapTokens ?? DEFAULT_CONFIG.overlapTokens,
  };

  // Headers are collected once for the whole document and looked up per chunk.
  const headerHierarchy = extractHeaderHierarchy(content);

  // Only an explicit `false` disables code-block preservation, so callers that
  // omit the option keep the previous behaviour.
  const segments: Segment[] =
    options.preserveCodeBlocks === false
      ? [
          {
            content,
            type: "translatable",
            startOffset: 0,
            endOffset: content.length,
          },
        ]
      : extractPreservedSections(content).segments;

  const chunks: Chunk[] = [];
  let previousChunkContent: string | undefined;

  for (const segment of segments) {
    const segmentHeaders = getHeadersForPosition(
      headerHierarchy,
      segment.startOffset
    );

    if (segment.type === "preserve") {
      // Preserved content is passed through whole, never split.
      chunks.push({
        id: `chunk-${chunks.length}`,
        content: segment.content,
        type: "preserve",
        startOffset: segment.startOffset,
        endOffset: segment.endOffset,
        metadata: {
          headerHierarchy: segmentHeaders,
        },
      });
      continue;
    }

    const textChunks = splitIntoChunks(
      segment.content,
      config,
      segment.startOffset
    );

    for (const chunk of textChunks) {
      // Headers may change inside a long segment, so resolve them per chunk
      // and fall back to the segment's headers when none apply.
      const chunkHeaders = getHeadersForPosition(
        headerHierarchy,
        chunk.startOffset
      );

      chunks.push({
        ...chunk,
        id: `chunk-${chunks.length}`,
        metadata: {
          headerHierarchy:
            chunkHeaders.length > 0 ? chunkHeaders : segmentHeaders,
          previousContext: previousChunkContent,
        },
      });

      // Remember the tail of this chunk as context for the next one.
      previousChunkContent = truncateForContext(chunk.content, 200);
    }
  }

  return chunks;
}
112
+
113
/**
 * Extract header hierarchy from markdown content.
 *
 * Finds ATX-style headers (`#` through `######`, followed by spaces or tabs
 * and some text on the same line). Lines inside fenced code blocks delimited
 * by triple backticks are ignored, so `# comment` lines in code samples are
 * not mistaken for headers.
 *
 * @param content - Markdown source to scan.
 * @returns One entry per header in document order: `level` is the number of
 *          `#` marks, `text` is the full header line including the marks, and
 *          `offset` is the index of the line start within `content`.
 */
function extractHeaderHierarchy(
  content: string
): Array<{ level: number; text: string; offset: number }> {
  // Collect fenced code regions first; regex matches arrive in document order.
  const fencedRanges: Array<{ start: number; end: number }> = [];
  const fenceRegex = /```[\s\S]*?```/g;
  let fence: RegExpExecArray | null;
  while ((fence = fenceRegex.exec(content)) !== null) {
    fencedRanges.push({
      start: fence.index,
      end: fence.index + fence[0].length,
    });
  }

  const headers: Array<{ level: number; text: string; offset: number }> = [];
  // `[ \t]+` (not `\s+`) keeps the marker and its text on the same line; `\s`
  // also matches newlines, which let a bare `#` line swallow the next line.
  const headerRegex = /^(#{1,6})[ \t]+(.+)$/gm;
  let match: RegExpExecArray | null;
  let fenceIndex = 0;

  while ((match = headerRegex.exec(content)) !== null) {
    // Both lists are ordered, so fences that end before this match can be
    // discarded permanently.
    let enclosing = fencedRanges[fenceIndex];
    while (enclosing !== undefined && enclosing.end <= match.index) {
      fenceIndex += 1;
      enclosing = fencedRanges[fenceIndex];
    }
    if (enclosing !== undefined && match.index >= enclosing.start) {
      continue; // The candidate line lies inside a fenced code block.
    }

    const hashMarks = match[1];
    if (hashMarks) {
      headers.push({
        level: hashMarks.length,
        text: match[0],
        offset: match.index,
      });
    }
  }

  return headers;
}
136
+
137
/**
 * Resolve the header trail that applies at a given document offset.
 *
 * Walks the headers in the order given, stopping at the first one that starts
 * after `position`. Each header replaces any previously seen header of the
 * same or deeper level. The surviving headers are returned from level 1 down
 * to level 6.
 *
 * @param headers - Headers with their level, text and start offset.
 * @param position - Offset in the document to resolve headers for.
 * @returns Header texts from the shallowest to the deepest active level.
 */
function getHeadersForPosition(
  headers: Array<{ level: number; text: string; offset: number }>,
  position: number
): string[] {
  const activeByLevel = new Map<number, string>();

  for (const { level, text, offset } of headers) {
    if (offset > position) {
      break;
    }

    // A new header closes every open header at its own level or deeper.
    for (const openLevel of Array.from(activeByLevel.keys())) {
      if (openLevel >= level) {
        activeByLevel.delete(openLevel);
      }
    }
    activeByLevel.set(level, text);
  }

  // Emit levels 1..6 in order, skipping levels with no active header.
  return [1, 2, 3, 4, 5, 6]
    .map((depth) => activeByLevel.get(depth))
    .filter((text): text is string => Boolean(text));
}
169
+
170
/**
 * Truncate content for context, keeping the END of the text and preferring a
 * word boundary.
 *
 * Content that already fits is returned unchanged. Otherwise the last
 * `maxChars` UTF-16 code units are kept and prefixed with "...". If a space
 * occurs within the first 50 kept characters, the partial word before it is
 * dropped.
 *
 * @param content - Text to shorten.
 * @param maxChars - Maximum number of trailing code units to keep.
 * @returns The original content, or "..." followed by its (trimmed) tail.
 */
function truncateForContext(content: string, maxChars: number): string {
  if (content.length <= maxChars) return content;

  // `slice(-0)` would return the whole string and a negative budget would
  // slice from the wrong end, so a non-positive budget keeps nothing.
  if (maxChars <= 0) return "...";

  let cut = content.length - maxChars;

  // Do not start in the middle of a surrogate pair: a lone low surrogate at
  // the head of the tail is not valid text.
  const headCode = content.charCodeAt(cut);
  const prevCode = content.charCodeAt(cut - 1);
  const startsWithLowSurrogate = headCode >= 0xdc00 && headCode <= 0xdfff;
  const followsHighSurrogate = prevCode >= 0xd800 && prevCode <= 0xdbff;
  if (startsWithLowSurrogate && followsHighSurrogate) {
    cut += 1;
  }

  const truncated = content.slice(cut);
  const firstSpace = truncated.indexOf(" ");
  if (firstSpace > 0 && firstSpace < 50) {
    // Drop the leading partial word so the context starts on a whole word.
    return "..." + truncated.slice(firstSpace + 1);
  }
  return "..." + truncated;
}
183
+
184
+ // ============================================================================
185
+ // Preserved Section Extraction
186
+ // ============================================================================
187
+
188
/**
 * A contiguous slice of the source document, classified as either text to
 * translate or content to pass through untouched.
 */
interface Segment {
  /** Exact text of the slice. */
  content: string;
  /** "preserve" for fenced code and whitespace-only gaps, otherwise "translatable". */
  type: "translatable" | "preserve";
  /** Index of the first character of the slice in the source document. */
  startOffset: number;
  /** Index just past the last character of the slice. */
  endOffset: number;
  /** Optional header trail; not populated by `extractPreservedSections`. */
  headerHierarchy?: string[];
}

/**
 * Partition a document into alternating text and fenced-code segments.
 *
 * Every triple-backtick fenced block becomes a "preserve" segment. The text
 * between fences becomes a "translatable" segment, unless it is whitespace
 * only, in which case it is also marked "preserve" so line breaks survive
 * verbatim. Segments are returned in document order and together cover the
 * whole input.
 *
 * @param content - Markdown source to partition.
 * @returns The ordered segments; empty input yields one empty translatable segment.
 */
function extractPreservedSections(content: string): { segments: Segment[] } {
  // Locate every fenced code block.
  const fencePattern = /```[\s\S]*?```/g;
  const fences: Array<{ start: number; end: number; content: string }> = [];
  for (
    let hit = fencePattern.exec(content);
    hit !== null;
    hit = fencePattern.exec(content)
  ) {
    fences.push({
      start: hit.index,
      end: hit.index + hit[0].length,
      content: hit[0],
    });
  }
  fences.sort((left, right) => left.start - right.start);

  const segments: Segment[] = [];

  // Emit the text lying between two offsets, if there is any.
  const pushGap = (from: number, to: number): void => {
    if (to <= from) return;
    const gap = content.slice(from, to);
    segments.push({
      content: gap,
      type: gap.trim() ? "translatable" : "preserve",
      startOffset: from,
      endOffset: to,
    });
  };

  let cursor = 0;
  for (const fence of fences) {
    pushGap(cursor, fence.start);
    segments.push({
      content: fence.content,
      type: "preserve",
      startOffset: fence.start,
      endOffset: fence.end,
    });
    cursor = fence.end;
  }

  // Whatever follows the last fence (or the whole document if there were none).
  pushGap(cursor, content.length);

  // Nothing emitted at all: hand back the input as a single translatable segment.
  if (segments.length === 0) {
    segments.push({
      content,
      type: "translatable",
      startOffset: 0,
      endOffset: content.length,
    });
  }

  return { segments };
}
274
+
275
+ // ============================================================================
276
+ // Text Chunking with Overlap
277
+ // ============================================================================
278
+
279
/**
 * Split one translatable segment into chunks at paragraph boundaries.
 *
 * Text that fits within `config.maxTokens` is returned as a single chunk.
 * Otherwise the text is split on runs of blank lines (the separators are kept,
 * so concatenating the chunks reproduces `text` exactly) and paragraphs are
 * packed greedily into chunks. A single paragraph that is larger than
 * `config.maxTokens` is not subdivided and becomes an oversized chunk.
 * `config.overlapTokens` and `config.separators` are not consulted here.
 *
 * Chunks that contain only whitespace are typed "preserve" rather than
 * "translatable", the same rule `extractPreservedSections` applies to
 * whitespace-only segments.
 *
 * @param text - Segment text to split.
 * @param config - Chunking configuration; only `maxTokens` is read.
 * @param baseOffset - Offset of `text` within the full document; chunk offsets
 *                     are reported relative to the document.
 * @returns Chunks in order, each with an empty `id` for the caller to fill in.
 */
function splitIntoChunks(
  text: string,
  config: ChunkingConfig,
  baseOffset: number
): Chunk[] {
  // If text fits in one chunk, return it as-is (preserve whitespace).
  if (estimateTokens(text) <= config.maxTokens) {
    return [
      {
        id: "",
        content: text,
        type: "translatable",
        startOffset: baseOffset,
        endOffset: baseOffset + text.length,
      },
    ];
  }

  const chunks: Chunk[] = [];

  // Record a finished chunk without trimming its content. A whitespace-only
  // chunk (e.g. a separator run flushed on its own ahead of an oversized
  // paragraph) has nothing to translate, so it is marked "preserve".
  const pushChunk = (
    content: string,
    startOffset: number,
    endOffset: number
  ): void => {
    chunks.push({
      id: "",
      content,
      type: content.trim() ? "translatable" : "preserve",
      startOffset,
      endOffset,
    });
  };

  // The capturing group keeps each separator as its own array element so the
  // exact whitespace is preserved.
  const parts = text.split(/(\n\n+)/);

  let currentChunk = "";
  let chunkStartOffset = baseOffset;
  let textOffset = baseOffset;

  for (const part of parts) {
    const potentialChunk = currentChunk + part;

    if (estimateTokens(potentialChunk) > config.maxTokens && currentChunk) {
      // Adding this part would exceed the budget: flush what has accumulated
      // and start the next chunk with this part.
      pushChunk(currentChunk, chunkStartOffset, textOffset);
      currentChunk = part;
      chunkStartOffset = textOffset;
    } else {
      currentChunk = potentialChunk;
    }

    textOffset += part.length;
  }

  // Flush the remainder.
  if (currentChunk.length > 0) {
    pushChunk(currentChunk, chunkStartOffset, baseOffset + text.length);
  }

  return chunks;
}
348
+
349
+ // ============================================================================
350
+ // Utility Functions
351
+ // ============================================================================
352
+
353
/**
 * Rebuild a document from its chunks.
 *
 * The chunks are ordered by `startOffset` (the input array is left untouched)
 * and their contents are joined with no separator. Chunk contents are expected
 * not to overlap; overlap exists only as context metadata.
 *
 * @param chunks - Chunks in any order.
 * @returns The concatenated content in document order.
 */
export function reassembleChunks(chunks: Chunk[]): string {
  const ordered = Array.from(chunks);
  ordered.sort((left, right) => left.startOffset - right.startOffset);

  const texts: string[] = [];
  for (const piece of ordered) {
    texts.push(piece.content);
  }
  return texts.join("");
}
364
+
365
/**
 * Summarise a list of chunks.
 *
 * @param chunks - Chunks to measure.
 * @returns Counts of all, translatable and preserved chunks, the total
 *          estimated tokens across every chunk, and the rounded mean tokens
 *          per chunk (0 when there are no chunks).
 */
export function getChunkStats(chunks: Chunk[]): {
  totalChunks: number;
  translatableChunks: number;
  preservedChunks: number;
  totalTokens: number;
  averageTokens: number;
} {
  let translatableCount = 0;
  let preservedCount = 0;
  let tokenSum = 0;

  // One pass: classify each chunk and accumulate its estimated token count.
  for (const chunk of chunks) {
    if (chunk.type === "translatable") {
      translatableCount += 1;
    }
    if (chunk.type === "preserve") {
      preservedCount += 1;
    }
    tokenSum += estimateTokens(chunk.content);
  }

  const totalChunks = chunks.length;
  let averageTokens = 0;
  if (totalChunks > 0) {
    averageTokens = Math.round(tokenSum / totalChunks);
  }

  return {
    totalChunks,
    translatableChunks: translatableCount,
    preservedChunks: preservedCount,
    totalTokens: tokenSum,
    averageTokens,
  };
}