@llm-translate/cli 1.0.0-next.2 → 1.0.0-next.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Dockerfile CHANGED
@@ -38,9 +38,13 @@ COPY --from=builder --chown=llmtranslate:nodejs /app/node_modules ./node_modules
38
38
  COPY --from=builder --chown=llmtranslate:nodejs /app/dist ./dist
39
39
  COPY --from=builder --chown=llmtranslate:nodejs /app/package.json ./
40
40
 
41
+ # Create cache directory with correct ownership
42
+ RUN mkdir -p /app/cache && chown llmtranslate:nodejs /app/cache
43
+
41
44
  # Environment
42
45
  ENV NODE_ENV=production
43
46
  ENV TRANSLATE_PORT=3000
47
+ ENV TRANSLATE_CACHE_DIR=/app/cache
44
48
 
45
49
  # Switch to non-root user
46
50
  USER llmtranslate
@@ -51,5 +55,6 @@ EXPOSE 3000
51
55
  HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
52
56
  CMD node -e "fetch('http://localhost:3000/health/live').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"
53
57
 
54
- # Start server with JSON logging for container environments
55
- CMD ["node", "dist/cli/index.js", "serve", "--json", "--cors", "--no-auth", "--cache-dir", "./.translate-cache/server"]
58
+ # ENTRYPOINT for CLI, CMD for default arguments
59
+ ENTRYPOINT ["node", "dist/cli/index.js"]
60
+ CMD ["serve", "--json", "--cors", "--no-auth", "--cache-dir", "/app/cache"]
package/dist/cli/index.js CHANGED
@@ -8,6 +8,7 @@ import 'remark-parse';
8
8
  import 'remark-stringify';
9
9
  import 'remark-gfm';
10
10
  import 'unist-util-visit';
11
+ import * as cheerio from 'cheerio';
11
12
  import { createAnthropic } from '@ai-sdk/anthropic';
12
13
  import { generateText, streamText } from 'ai';
13
14
  import { createOpenAI } from '@ai-sdk/openai';
@@ -1789,6 +1790,243 @@ var init_markdown = __esm({
1789
1790
  "src/parsers/markdown.ts"() {
1790
1791
  }
1791
1792
  });
1793
/**
 * Parse an HTML string with cheerio and collect its translatable sections.
 *
 * @param {string} content - Raw HTML; either a complete document or a fragment.
 * @returns {{original: string, $: object, sections: object[], isFullDocument: boolean}}
 *   The untouched input, the loaded cheerio instance, the sections returned by
 *   `extractTranslatableSections`, and whether the input was classified as a
 *   full document.
 */
function parseHTML(content) {
  const $ = cheerio.load(content, {
    decodeEntities: false,
    xmlMode: false
  });
  // HTML tag names are case-insensitive, so "<HTML>" / "<BODY>" must count too.
  // A doctype also marks a complete document even when the <html> tag is omitted.
  const isFullDocument = /<(?:html|body)|<!doctype/i.test(content);
  const sections = extractTranslatableSections($, isFullDocument);
  return {
    original: content,
    $,
    sections,
    isFullDocument
  };
}
1807
/**
 * Walk the DOM and collect every non-blank text node and every translatable
 * attribute as a section record.
 *
 * @param {object} $ - Loaded cheerio instance.
 * @param {boolean} isFullDocument - When true the walk starts at `<body>`;
 *   otherwise it starts at the cheerio root.
 * @returns {object[]} Sections in document order. Each has `id`, `content`,
 *   `selector`, `tagName`, `tokenCount`, `translatable`, and, for attributes,
 *   `isAttribute` / `attributeName`.
 */
function extractTranslatableSections($, isFullDocument) {
  const sections = [];
  let sectionId = 0;
  const root = isFullDocument ? $("body") : $.root();
  // Anchor selectors at <body> for full documents. With an empty prefix a
  // top-level selector such as "p" would also match nested <p> elements.
  const rootSelector = isFullDocument ? "body" : "";

  function processElement(element, parentSelector) {
    let textNodeIndex = 0;
    element.contents().each((_index, node) => {
      if (node.type === "text") {
        const text = node.data;
        // Blank text nodes still consume an index so that the ::text(n)
        // position counts every text child of the parent.
        const currentTextIndex = textNodeIndex++;
        if (!text || !text.trim()) return;
        const parent = $(node).parent();
        const tagName = parent[0]?.tagName?.toLowerCase() || "unknown";
        if (SKIP_TAGS.has(tagName)) return;
        const selector = buildSelector($, parent, parentSelector, currentTextIndex);
        sections.push({
          id: `section-${sectionId++}`,
          content: text,
          selector,
          tagName,
          tokenCount: estimateTokens(text),
          translatable: true
        });
      } else if (node.type === "tag") {
        const tagName = node.tagName?.toLowerCase();
        // Skipped tags are not descended into, so their attributes and
        // text are never collected.
        if (SKIP_TAGS.has(tagName)) return;
        const $elem = $(node);
        const selector = buildSelector($, $elem, parentSelector);
        for (const attrName of TRANSLATABLE_ATTRIBUTES) {
          const attrValue = $elem.attr(attrName);
          if (attrValue && attrValue.trim()) {
            sections.push({
              id: `section-${sectionId++}`,
              content: attrValue,
              selector,
              tagName,
              isAttribute: true,
              attributeName: attrName,
              tokenCount: estimateTokens(attrValue),
              translatable: true
            });
          }
        }
        processElement($elem, selector);
      }
    });
  }

  processElement(root, rootSelector);
  return sections;
}
1859
/**
 * Build the selector string used to locate an element (or one of its text
 * nodes) again when translations are applied.
 *
 * @param {object} $ - Loaded cheerio instance.
 * @param {object} element - Cheerio selection; only its first node is used.
 * @param {string} parentSelector - Selector prefix accumulated so far.
 * @param {number} [textIndex] - When given, a `::text(n)` suffix is appended.
 * @returns {string} `#id` (or `[id="..."]`), or a child-combinator path, with
 *   the optional `::text(n)` suffix.
 */
function buildSelector($, element, parentSelector, textIndex) {
  const elem = element[0];
  const textSuffix = textIndex !== void 0 ? `::text(${textIndex})` : "";
  if (!elem || elem.type !== "tag") {
    return parentSelector + textSuffix;
  }
  const tagName = elem.tagName?.toLowerCase() || "unknown";
  const id = $(elem).attr("id");
  if (id) {
    // "#id" is only valid CSS for identifier-like ids. Anything else (leading
    // digit, ".", ":", quotes, ...) goes into a quoted attribute selector so
    // the selector stays valid and matches the literal id.
    const idSelector = /^[A-Za-z_][\w-]*$/.test(id)
      ? `#${id}`
      : `[id="${id.replace(/["\\]/g, "\\$&")}"]`;
    return idSelector + textSuffix;
  }
  if (textIndex !== void 0) {
    return `${parentSelector || tagName}${textSuffix}`;
  }
  const siblings = $(elem).parent().children(tagName);
  let selector = tagName;
  // Only disambiguate when there are several same-tag siblings.
  if (siblings.length > 1) {
    selector += `:nth-of-type(${siblings.index(elem) + 1})`;
  }
  return parentSelector ? `${parentSelector} > ${selector}` : selector;
}
1888
/**
 * Greedily pack translatable sections into chunks whose summed `tokenCount`
 * stays within the token budget.
 *
 * @param {object[]} sections - Sections; non-translatable ones are ignored.
 * @param {{maxTokens?: number}} [options] - Overrides for the default budget.
 * @returns {object[]} Chunks built by `createChunk`, in section order. A
 *   single section larger than the budget still gets a chunk of its own.
 */
function chunkHTMLSections(sections, options = {}) {
  // Resolve with ?? rather than an object spread: a caller passing
  // `maxTokens: undefined` would otherwise overwrite the default, and
  // `n > undefined` is always false, which disables chunking entirely.
  const maxTokens = options?.maxTokens ?? DEFAULT_HTML_CHUNKING.maxTokens;
  const translatableSections = sections.filter((s) => s.translatable);
  if (translatableSections.length === 0) {
    return [];
  }
  const chunks = [];
  let currentChunk = [];
  let currentTokens = 0;
  let chunkId = 0;
  for (const section of translatableSections) {
    const sectionTokens = section.tokenCount;
    // Flush before adding a section that would overflow a non-empty chunk.
    if (currentTokens + sectionTokens > maxTokens && currentChunk.length > 0) {
      chunks.push(createChunk(currentChunk, chunkId++));
      currentChunk = [];
      currentTokens = 0;
    }
    currentChunk.push(section);
    currentTokens += sectionTokens;
  }
  if (currentChunk.length > 0) {
    chunks.push(createChunk(currentChunk, chunkId));
  }
  return chunks;
}
1913
/**
 * Serialize a group of sections into one translation unit.
 *
 * Each section becomes a `[section-N] text` entry (or `[section-N:attr] text`
 * for attribute sections); entries are separated by a blank line.
 *
 * @param {object[]} sections - Sections that belong to this chunk.
 * @param {number} id - Numeric chunk index, used to form `chunk-<id>`.
 * @returns {{id: string, content: string, sections: object[], tokenCount: number}}
 */
function createChunk(sections, id) {
  const content = sections
    .map((section) => {
      const marker = section.isAttribute
        ? `${section.id}:${section.attributeName}`
        : `${section.id}`;
      return `[${marker}] ${section.content}`;
    })
    .join("\n\n");
  return {
    id: `chunk-${id}`,
    content,
    sections,
    tokenCount: estimateTokens(content)
  };
}
1930
/**
 * Split a translated chunk back into per-section translations.
 *
 * A marker is a `[section-N]` or `[section-N:attr]` tag at the start of a
 * line. The translation for a section is everything between its marker and
 * the next marker (or the end of the text), trimmed.
 *
 * @param {{sections: {id: string}[]}} chunk - The chunk that was translated.
 * @param {string} translatedContent - Raw model output for that chunk.
 * @returns {Object<string, string>} Map from section id to translated text.
 *   If no marker is found and the chunk has exactly one section, the whole
 *   trimmed output is assigned to that section.
 */
function parseTranslatedChunk(chunk, translatedContent) {
  const map = {};
  // Locate the markers first and slice between them. Matching the body with
  // a "no '[' allowed" pattern drops any translation that contains a bracket
  // (e.g. "Click [here]") and fails when sections are separated by a single
  // newline instead of a blank line.
  const markerPattern = /^[ \t]*\[(section-\d+)[^\]\n]*\][ \t]*/gm;
  const markers = [...translatedContent.matchAll(markerPattern)];
  markers.forEach((marker, position) => {
    const start = marker.index + marker[0].length;
    const end = position + 1 < markers.length
      ? markers[position + 1].index
      : translatedContent.length;
    map[marker[1]] = translatedContent.slice(start, end).trim();
  });
  if (Object.keys(map).length === 0 && chunk.sections.length === 1) {
    map[chunk.sections[0].id] = translatedContent.trim();
  }
  return map;
}
1947
/**
 * Write translations back into the parsed document and serialize it.
 *
 * @param {{$: object, sections: object[], isFullDocument: boolean}} document
 *   Parsed document as returned by `parseHTML`; its DOM is mutated in place.
 * @param {Object<string, string>} translations - Map from section id to
 *   translated text. Sections with a missing or empty entry are left as-is.
 * @returns {string} Serialized HTML: the whole document for full documents,
 *   otherwise the inner HTML of `<body>` (falling back to the whole document
 *   when that is empty).
 */
function applyHTMLTranslations(document, translations) {
  const $ = document.$;
  for (const section of document.sections) {
    const translation = translations[section.id];
    if (!translation) continue;
    try {
      if (section.isAttribute && section.attributeName) {
        // .first(): a selector that happens to match several elements must
        // not stamp the same attribute value onto all of them.
        $(section.selector.replace(/::text\(\d+\)$/, ""))
          .first()
          .attr(section.attributeName, translation);
        continue;
      }
      const textMatch = section.selector.match(/^(.*)::text\((\d+)\)$/);
      // Without a ::text(n) suffix, the first text child is the target.
      const targetSelector = textMatch ? textMatch[1] : section.selector;
      const wantedIndex = textMatch ? Number.parseInt(textMatch[2], 10) : 0;
      // .first(): the text index is relative to ONE parent; counting across
      // the concatenated children of several matches would hit the wrong node.
      const target = $(targetSelector).first();
      let seenTextNodes = 0;
      target.contents().each((_i, node) => {
        if (node.type !== "text") return void 0;
        if (seenTextNodes === wantedIndex) {
          node.data = translation;
          return false; // stop iterating once the node is updated
        }
        seenTextNodes++;
        return void 0;
      });
    } catch (error) {
      // Best effort: a bad selector only costs this one section.
      console.warn(`Failed to apply translation for ${section.id}:`, error);
    }
  }
  if (document.isFullDocument) {
    return $.html();
  }
  return $("body").html() || $.html();
}
1994
/**
 * Summarize the sections of a parsed HTML document.
 *
 * @param {{sections: object[]}} document - Parsed document.
 * @returns {{totalSections: number, translatableSections: number,
 *   attributeSections: number, totalTokens: number, avgTokensPerSection: number}}
 *   Counts plus the summed `tokenCount`; the average is rounded and is 0 when
 *   there are no sections.
 */
function getHTMLStats(document) {
  const { sections } = document;
  let translatableCount = 0;
  let attributeCount = 0;
  let tokenSum = 0;
  // One pass instead of two filters and a reduce.
  for (const entry of sections) {
    if (entry.translatable) translatableCount += 1;
    if (entry.isAttribute) attributeCount += 1;
    tokenSum = tokenSum + entry.tokenCount;
  }
  const sectionCount = sections.length;
  let average = 0;
  if (sectionCount > 0) {
    average = Math.round(tokenSum / sectionCount);
  }
  return {
    totalSections: sectionCount,
    translatableSections: translatableCount,
    attributeSections: attributeCount,
    totalTokens: tokenSum,
    avgTokensPerSection: average
  };
}
2006
// Module-level settings for the HTML parser. They are assigned when the
// initializer wrapped by __esm below runs.
var SKIP_TAGS;
var TRANSLATABLE_ATTRIBUTES;
var DEFAULT_HTML_CHUNKING;
var init_html = __esm({
  "src/parsers/html.ts"() {
    init_tokens();
    // Elements whose content is never extracted for translation.
    SKIP_TAGS = /* @__PURE__ */ new Set([
      "script",
      "style",
      "code",
      "pre",
      "kbd",
      "samp",
      "var",
      "noscript",
      "template",
      "svg",
      "math"
    ]);
    // Attributes whose values are extracted as translatable sections.
    TRANSLATABLE_ATTRIBUTES = ["alt", "title", "placeholder", "aria-label"];
    // Default chunking options; chunkHTMLSections reads maxTokens.
    DEFAULT_HTML_CHUNKING = {
      maxTokens: 2048,
      minTokensForChunk: 100
    };
  }
});
1792
2030
  function mapFinishReason(reason) {
1793
2031
  switch (reason) {
1794
2032
  case "stop":
@@ -2949,6 +3187,7 @@ var init_engine = __esm({
2949
3187
  init_agent();
2950
3188
  init_chunker();
2951
3189
  init_markdown();
3190
+ init_html();
2952
3191
  init_glossary();
2953
3192
  init_registry();
2954
3193
  init_logger();
@@ -3037,7 +3276,7 @@ var init_engine = __esm({
3037
3276
  result = await this.translateMarkdown(options, glossary);
3038
3277
  break;
3039
3278
  case "html":
3040
- result = await this.translatePlainText(options, glossary);
3279
+ result = await this.translateHTML(options, glossary);
3041
3280
  break;
3042
3281
  case "text":
3043
3282
  default:
@@ -3176,6 +3415,157 @@ var init_engine = __esm({
3176
3415
  }
3177
3416
  };
3178
3417
  }
3418
+ async translateHTML(options, glossary) {
3419
+ const document = parseHTML(options.content);
3420
+ if (this.verbose) {
3421
+ const stats = getHTMLStats(document);
3422
+ logger.info(`Parsed HTML: ${stats.translatableSections} translatable sections, ${stats.totalTokens} tokens`);
3423
+ }
3424
+ if (document.sections.length === 0) {
3425
+ return {
3426
+ content: options.content,
3427
+ chunks: [],
3428
+ metadata: {
3429
+ totalTokensUsed: 0,
3430
+ totalDuration: 0,
3431
+ averageQuality: 100,
3432
+ provider: this.provider.name,
3433
+ model: this.config.provider.model ?? this.provider.defaultModel,
3434
+ totalIterations: 0,
3435
+ tokensUsed: { input: 0, output: 0 },
3436
+ cache: { hits: 0, misses: 0 }
3437
+ }
3438
+ };
3439
+ }
3440
+ const chunks = chunkHTMLSections(document.sections, {
3441
+ maxTokens: this.config.chunking.maxTokens
3442
+ });
3443
+ if (this.verbose) {
3444
+ logger.info(`Chunked into ${chunks.length} translation units`);
3445
+ }
3446
+ const agent = createTranslationAgent({
3447
+ provider: this.provider,
3448
+ qualityThreshold: options.qualityThreshold ?? this.config.quality.threshold,
3449
+ maxIterations: options.maxIterations ?? this.config.quality.maxIterations,
3450
+ verbose: this.verbose,
3451
+ strictQuality: options.strictQuality
3452
+ });
3453
+ const allTranslations = {};
3454
+ const chunkResults = [];
3455
+ let totalInputTokens = 0;
3456
+ let totalOutputTokens = 0;
3457
+ let totalIterations = 0;
3458
+ for (let i = 0; i < chunks.length; i++) {
3459
+ const chunk = chunks[i];
3460
+ if (!chunk) continue;
3461
+ if (this.verbose) {
3462
+ logger.info(`Translating HTML chunk ${i + 1}/${chunks.length} (${chunk.sections.length} sections)...`);
3463
+ }
3464
+ const glossaryString = glossary ? JSON.stringify(glossary.terms.map((t) => ({ s: t.source, t: t.target }))) : void 0;
3465
+ const cacheKey = {
3466
+ content: chunk.content,
3467
+ sourceLang: options.sourceLang,
3468
+ targetLang: options.targetLang,
3469
+ glossary: glossaryString,
3470
+ provider: this.provider.name,
3471
+ model: this.config.provider.model ?? this.provider.defaultModel
3472
+ };
3473
+ const cacheResult = this.cache.get(cacheKey);
3474
+ if (cacheResult.hit && cacheResult.entry) {
3475
+ this.cacheHits++;
3476
+ if (this.verbose) {
3477
+ logger.info(` \u21B3 Cache hit (quality: ${cacheResult.entry.qualityScore})`);
3478
+ }
3479
+ const chunkTranslations = parseTranslatedChunk(chunk, cacheResult.entry.translation);
3480
+ Object.assign(allTranslations, chunkTranslations);
3481
+ chunkResults.push({
3482
+ original: chunk.content,
3483
+ translated: cacheResult.entry.translation,
3484
+ startOffset: 0,
3485
+ endOffset: chunk.content.length,
3486
+ qualityScore: cacheResult.entry.qualityScore,
3487
+ iterations: 0,
3488
+ tokensUsed: { input: 0, output: 0, cacheRead: 1 },
3489
+ cached: true
3490
+ });
3491
+ continue;
3492
+ }
3493
+ this.cacheMisses++;
3494
+ const resolvedStyleInstruction = options.styleInstruction ?? this.config.languages.styles?.[options.targetLang];
3495
+ const request = {
3496
+ content: chunk.content,
3497
+ sourceLang: options.sourceLang,
3498
+ targetLang: options.targetLang,
3499
+ format: "html",
3500
+ glossary,
3501
+ context: {
3502
+ documentPurpose: options.context,
3503
+ styleInstruction: resolvedStyleInstruction,
3504
+ documentSummary: "HTML document with structured sections. Preserve the [section-N] markers exactly as they appear. Translate only the text after each marker."
3505
+ }
3506
+ };
3507
+ try {
3508
+ const result = await agent.translate(request);
3509
+ const chunkTranslations = parseTranslatedChunk(chunk, result.content);
3510
+ Object.assign(allTranslations, chunkTranslations);
3511
+ this.cache.set(cacheKey, result.content, result.metadata.qualityScore);
3512
+ chunkResults.push({
3513
+ original: chunk.content,
3514
+ translated: result.content,
3515
+ startOffset: 0,
3516
+ endOffset: chunk.content.length,
3517
+ qualityScore: result.metadata.qualityScore,
3518
+ iterations: result.metadata.iterations,
3519
+ tokensUsed: result.metadata.tokensUsed
3520
+ });
3521
+ if (result.metadata.tokensUsed) {
3522
+ totalInputTokens += result.metadata.tokensUsed.input;
3523
+ totalOutputTokens += result.metadata.tokensUsed.output;
3524
+ }
3525
+ totalIterations += result.metadata.iterations;
3526
+ } catch (error) {
3527
+ logger.error(`Failed to translate HTML chunk ${i + 1}: ${error}`);
3528
+ for (const section of chunk.sections) {
3529
+ allTranslations[section.id] = section.content;
3530
+ }
3531
+ chunkResults.push({
3532
+ original: chunk.content,
3533
+ translated: chunk.content,
3534
+ startOffset: 0,
3535
+ endOffset: chunk.content.length,
3536
+ qualityScore: 0,
3537
+ iterations: 0,
3538
+ tokensUsed: { input: 0, output: 0 }
3539
+ });
3540
+ }
3541
+ }
3542
+ const finalContent = applyHTMLTranslations(document, allTranslations);
3543
+ const qualityScores = chunkResults.filter((r) => r.qualityScore > 0).map((r) => r.qualityScore);
3544
+ const averageQuality = qualityScores.length > 0 ? qualityScores.reduce((a, b) => a + b, 0) / qualityScores.length : 0;
3545
+ const cacheHits = chunkResults.filter((r) => r.cached).length;
3546
+ const cacheMisses = chunkResults.filter((r) => !r.cached && r.qualityScore > 0).length;
3547
+ return {
3548
+ content: finalContent,
3549
+ chunks: chunkResults,
3550
+ metadata: {
3551
+ totalTokensUsed: totalInputTokens + totalOutputTokens,
3552
+ totalDuration: 0,
3553
+ // Will be set by caller
3554
+ averageQuality,
3555
+ provider: this.provider.name,
3556
+ model: this.config.provider.model ?? this.provider.defaultModel,
3557
+ totalIterations,
3558
+ tokensUsed: {
3559
+ input: totalInputTokens,
3560
+ output: totalOutputTokens
3561
+ },
3562
+ cache: {
3563
+ hits: cacheHits,
3564
+ misses: cacheMisses
3565
+ }
3566
+ }
3567
+ };
3568
+ }
3179
3569
  async translatePlainText(options, glossary) {
3180
3570
  const chunks = chunkContent(options.content, {
3181
3571
  maxTokens: this.config.chunking.maxTokens,