@llm-translate/cli 1.0.0-next.3 → 1.0.0-next.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -8,6 +8,7 @@ import 'remark-parse';
8
8
  import 'remark-stringify';
9
9
  import 'remark-gfm';
10
10
  import 'unist-util-visit';
11
+ import * as cheerio from 'cheerio';
11
12
  import { createAnthropic } from '@ai-sdk/anthropic';
12
13
  import { generateText, streamText } from 'ai';
13
14
  import { createOpenAI } from '@ai-sdk/openai';
@@ -1789,6 +1790,243 @@ var init_markdown = __esm({
1789
1790
  "src/parsers/markdown.ts"() {
1790
1791
  }
1791
1792
  });
1793
/**
 * Parses an HTML string with cheerio and collects its translatable sections.
 *
 * @param {string} content - Raw HTML, either a full document or a fragment.
 * @returns {{original: string, $: object, sections: object[], isFullDocument: boolean}}
 *   The untouched input, the loaded cheerio instance, the sections returned by
 *   `extractTranslatableSections`, and whether the input was treated as a full document.
 */
function parseHTML(content) {
  const $ = cheerio.load(content, {
    decodeEntities: false,
    xmlMode: false
  });
  // HTML tag names are case-insensitive, so `<HTML>` / `<BODY>` must count as a
  // full document too. A doctype or `<head>` also marks a document: treating such
  // input as a fragment would make applyHTMLTranslations return only the body's
  // inner HTML. `\b` keeps `<header>` from being mistaken for `<head>`.
  const isFullDocument = /<!doctype\b|<(?:html|head|body)\b/i.test(content);
  const sections = extractTranslatableSections($, isFullDocument);
  return {
    original: content,
    $,
    sections,
    isFullDocument
  };
}
1807
/**
 * Walks the parsed DOM and records every translatable text node and attribute.
 *
 * Each record carries a selector string (built by `buildSelector`) that is later
 * used to find the node again. Text nodes get a `::text(N)` suffix, where N is
 * the index of the text node among its parent's text-node children.
 *
 * @param {Function} $ - Loaded cheerio instance.
 * @param {boolean} isFullDocument - When true the walk starts at `<body>`,
 *   otherwise at the document root.
 * @returns {object[]} Sections in document order, with ids `section-0`, `section-1`, ...
 */
function extractTranslatableSections($, isFullDocument) {
  const sections = [];
  let sectionId = 0;
  const root = isFullDocument ? $("body") : $.root();

  function processElement(element, parentSelector) {
    let textNodeIndex = 0;
    element.contents().each((_index, node) => {
      if (node.type === "text") {
        const text = node.data;
        // Count every text node, including blank ones, so that the index matches
        // the counting done when translations are applied.
        const currentTextIndex = textNodeIndex++;
        if (!text || !text.trim()) return;
        const parent = $(node).parent();
        const tagName = parent[0]?.tagName?.toLowerCase() || "unknown";
        if (SKIP_TAGS.has(tagName)) return;
        const selector = buildSelector($, parent, parentSelector, currentTextIndex);
        sections.push({
          id: `section-${sectionId++}`,
          content: text,
          selector,
          tagName,
          tokenCount: estimateTokens(text),
          translatable: true
        });
      } else if (node.type === "tag") {
        const tagName = node.tagName?.toLowerCase();
        if (SKIP_TAGS.has(tagName)) return;
        const $elem = $(node);
        const selector = buildSelector($, $elem, parentSelector);
        for (const attrName of TRANSLATABLE_ATTRIBUTES) {
          const attrValue = $elem.attr(attrName);
          if (attrValue && attrValue.trim()) {
            sections.push({
              id: `section-${sectionId++}`,
              content: attrValue,
              selector,
              tagName,
              isAttribute: true,
              attributeName: attrName,
              tokenCount: estimateTokens(attrValue),
              translatable: true
            });
          }
        }
        processElement($elem, selector);
      }
    });
  }

  // Anchor selectors at `body` when the walk starts there. With an empty prefix a
  // top-level selector such as `div` would also match nested <div>s, and an
  // attribute translation would then be written to every match.
  processElement(root, isFullDocument ? "body" : "");
  return sections;
}
1859
/**
 * Builds the selector string used to locate an element, or one of its text nodes, again.
 *
 * @param {Function} $ - Loaded cheerio instance.
 * @param {object} element - Cheerio selection; only its first node is inspected.
 * @param {string} parentSelector - Selector prefix supplied by the caller.
 * @param {number} [textIndex] - When given, the result addresses a text node and
 *   ends in `::text(<textIndex>)`.
 * @returns {string} The selector. For a non-tag node it is just `parentSelector`
 *   plus the optional text suffix.
 */
function buildSelector($, element, parentSelector, textIndex) {
  const elem = element[0];
  const textSuffix = textIndex !== void 0 ? `::text(${textIndex})` : "";
  if (!elem || elem.type !== "tag") {
    return parentSelector + textSuffix;
  }
  const tagName = elem.tagName?.toLowerCase() || "unknown";
  const id = $(elem).attr("id");
  if (id) {
    // `#id` is only valid when the id is a plain CSS identifier. Ids such as
    // "sec:1.2" or "1abc" would produce an invalid or mis-targeted selector, so
    // those use an attribute selector with quotes and backslashes escaped.
    const isSimpleId = /^[A-Za-z_][\w-]*$/.test(id);
    const idSelector = isSimpleId ? `#${id}` : `[id="${id.replace(/["\\]/g, "\\$&")}"]`;
    return idSelector + textSuffix;
  }
  if (textIndex !== void 0) {
    // For text nodes the caller passes the owning element's own selector as the prefix.
    return `${parentSelector || tagName}${textSuffix}`;
  }
  const siblings = $(elem).parent().children(tagName);
  let selector = tagName;
  if (siblings.length > 1) {
    // Disambiguate among same-tag siblings; :nth-of-type is 1-based.
    selector += `:nth-of-type(${siblings.index(elem) + 1})`;
  }
  return parentSelector ? `${parentSelector} > ${selector}` : selector;
}
1888
/**
 * Groups translatable sections into chunks whose summed `tokenCount` stays
 * within `maxTokens`.
 *
 * A section is never split: one that exceeds the budget by itself still gets
 * its own chunk.
 *
 * @param {object[]} sections - Sections to group; those with a falsy `translatable` are ignored.
 * @param {object} [options] - Overrides merged over `DEFAULT_HTML_CHUNKING`.
 * @returns {object[]} Chunks produced by `createChunk`, numbered from 0.
 */
function chunkHTMLSections(sections, options = {}) {
  const { maxTokens } = { ...DEFAULT_HTML_CHUNKING, ...options };
  const result = [];
  const pending = sections.filter((candidate) => candidate.translatable);
  if (pending.length === 0) {
    return result;
  }

  let batch = [];
  let batchTokens = 0;
  // result.length doubles as the next chunk number: it grows by one per flush.
  const flush = () => {
    result.push(createChunk(batch, result.length));
    batch = [];
    batchTokens = 0;
  };

  for (const section of pending) {
    const wouldOverflow = batchTokens + section.tokenCount > maxTokens;
    if (wouldOverflow && batch.length > 0) {
      flush();
    }
    batch.push(section);
    batchTokens += section.tokenCount;
  }
  if (batch.length > 0) {
    flush();
  }
  return result;
}
1913
/**
 * Serializes a group of sections into one marker-delimited chunk.
 *
 * Each section becomes `[<id>] <content>`, or `[<id>:<attribute>] <content>`
 * for attribute sections; entries are separated by a blank line.
 *
 * @param {object[]} sections - Sections that belong to the chunk.
 * @param {number} id - Numeric suffix for the chunk id.
 * @returns {{id: string, content: string, sections: object[], tokenCount: number}}
 */
function createChunk(sections, id) {
  const content = sections
    .map((section) => {
      const marker = section.isAttribute
        ? `${section.id}:${section.attributeName}`
        : `${section.id}`;
      return `[${marker}] ${section.content}`;
    })
    .join("\n\n");
  return {
    id: `chunk-${id}`,
    content,
    sections,
    tokenCount: estimateTokens(content)
  };
}
1930
/**
 * Splits a translated chunk back into per-section translations.
 *
 * The text is expected to contain `[section-N]` or `[section-N:attr]` markers,
 * each at the start of a line. Everything between one marker and the next (or
 * the end of the text) is that section's translation, trimmed.
 *
 * @param {{sections: {id: string}[]}} chunk - The chunk that was sent for translation.
 * @param {string} translatedContent - Raw translated text.
 * @returns {Object<string, string>} Map from section id to its translation. When
 *   no marker is found and the chunk has exactly one section, the whole trimmed
 *   text is assigned to that section.
 */
function parseTranslatedChunk(chunk, translatedContent) {
  const map = {};
  // Markers are only recognized at the start of a line. A translation may then
  // contain "[" itself (e.g. "See [docs]") without being cut short or dropped,
  // and sections separated by a single newline are still told apart.
  const markerPattern = /(?:^|\n)[ \t]*\[(section-\d+)[^\]\n]*\][ \t]*/g;
  const markers = [...translatedContent.matchAll(markerPattern)];
  markers.forEach((marker, position) => {
    const start = marker.index + marker[0].length;
    const next = markers[position + 1];
    const end = next ? next.index : translatedContent.length;
    map[marker[1]] = translatedContent.slice(start, end).trim();
  });
  if (Object.keys(map).length === 0 && chunk.sections.length === 1) {
    map[chunk.sections[0].id] = translatedContent.trim();
  }
  return map;
}
1947
/**
 * Writes translations back into the parsed document and serializes it.
 *
 * @param {{$: Function, sections: object[], isFullDocument: boolean}} document -
 *   Result of `parseHTML`; its cheerio DOM is mutated in place.
 * @param {Object<string, string>} translations - Map from section id to translated
 *   text. Sections with a missing or empty translation are left unchanged.
 * @returns {string} The whole document for full documents, otherwise the inner
 *   HTML of `<body>` (falling back to the whole document when that is empty).
 */
function applyHTMLTranslations(document, translations) {
  const $ = document.$;

  // Replace a text node's wording but keep its original leading and trailing
  // whitespace. Translations arrive trimmed, and dropping the whitespace fuses
  // words around inline elements ("Click <a>here</a> now" -> "...</a>now").
  const setText = (node, translation) => {
    const original = typeof node.data === "string" ? node.data : "";
    const leading = original.match(/^\s*/)[0];
    const trailing = original.slice(leading.length).match(/\s*$/)[0];
    node.data = leading + translation.trim() + trailing;
  };

  for (const section of document.sections) {
    const translation = translations[section.id];
    if (!translation) continue;
    try {
      if (section.isAttribute && section.attributeName) {
        const elem = $(section.selector.replace(/::text\(\d+\)$/, ""));
        elem.attr(section.attributeName, translation);
        continue;
      }
      const textMatch = section.selector.match(/^(.*)::text\((\d+)\)$/);
      if (textMatch) {
        const [, parentSelector, textIndexStr] = textMatch;
        const textIndex = Number.parseInt(textIndexStr, 10);
        let currentTextIndex = 0;
        $(parentSelector).contents().each((_i, node) => {
          if (node.type !== "text") return;
          if (currentTextIndex === textIndex) {
            setText(node, translation);
            return false; // stop iterating once the target node is updated
          }
          currentTextIndex++;
        });
      } else {
        // No text index in the selector: update the first text node found.
        $(section.selector).contents().each((_i, node) => {
          if (node.type !== "text") return;
          setText(node, translation);
          return false;
        });
      }
    } catch (error) {
      // Best effort: a selector that cannot be resolved skips this section only.
      console.warn(`Failed to apply translation for ${section.id}:`, error);
    }
  }
  if (document.isFullDocument) {
    return $.html();
  }
  return $("body").html() || $.html();
}
1994
/**
 * Summarizes the sections of a parsed HTML document.
 *
 * @param {{sections: object[]}} document - Parsed document.
 * @returns {{totalSections: number, translatableSections: number,
 *   attributeSections: number, totalTokens: number, avgTokensPerSection: number}}
 *   Counts and token totals; the average is rounded and is 0 when there are no sections.
 */
function getHTMLStats(document) {
  const { sections } = document;
  let translatableCount = 0;
  let attributeCount = 0;
  let tokenSum = 0;
  // One pass gathers all three tallies.
  for (const section of sections) {
    if (section.translatable) translatableCount += 1;
    if (section.isAttribute) attributeCount += 1;
    tokenSum = tokenSum + section.tokenCount;
  }
  const sectionCount = sections.length;
  let average = 0;
  if (sectionCount > 0) {
    average = Math.round(tokenSum / sectionCount);
  }
  return {
    totalSections: sectionCount,
    translatableSections: translatableCount,
    attributeSections: attributeCount,
    totalTokens: tokenSum,
    avgTokensPerSection: average
  };
}
2006
// Settings shared by the HTML parser functions. They are declared here and
// assigned inside the "src/parsers/html.ts" initializer below.
var SKIP_TAGS, TRANSLATABLE_ATTRIBUTES, DEFAULT_HTML_CHUNKING;
var init_html = __esm({
  "src/parsers/html.ts"() {
    init_tokens();
    // Tags whose text and attributes are not extracted for translation.
    SKIP_TAGS = /* @__PURE__ */ new Set([
      "script", "style",
      "code", "pre", "kbd", "samp", "var",
      "noscript", "template",
      "svg", "math"
    ]);
    // Attributes whose values are extracted as translatable sections.
    TRANSLATABLE_ATTRIBUTES = ["alt", "title", "placeholder", "aria-label"];
    // Defaults merged under caller options in chunkHTMLSections.
    DEFAULT_HTML_CHUNKING = { maxTokens: 2048, minTokensForChunk: 100 };
  }
});
1792
2030
  function mapFinishReason(reason) {
1793
2031
  switch (reason) {
1794
2032
  case "stop":
@@ -2949,6 +3187,7 @@ var init_engine = __esm({
2949
3187
  init_agent();
2950
3188
  init_chunker();
2951
3189
  init_markdown();
3190
+ init_html();
2952
3191
  init_glossary();
2953
3192
  init_registry();
2954
3193
  init_logger();
@@ -3037,7 +3276,7 @@ var init_engine = __esm({
3037
3276
  result = await this.translateMarkdown(options, glossary);
3038
3277
  break;
3039
3278
  case "html":
3040
- result = await this.translatePlainText(options, glossary);
3279
+ result = await this.translateHTML(options, glossary);
3041
3280
  break;
3042
3281
  case "text":
3043
3282
  default:
@@ -3176,6 +3415,157 @@ var init_engine = __esm({
3176
3415
  }
3177
3416
  };
3178
3417
  }
3418
+ async translateHTML(options, glossary) {
3419
+ const document = parseHTML(options.content);
3420
+ if (this.verbose) {
3421
+ const stats = getHTMLStats(document);
3422
+ logger.info(`Parsed HTML: ${stats.translatableSections} translatable sections, ${stats.totalTokens} tokens`);
3423
+ }
3424
+ if (document.sections.length === 0) {
3425
+ return {
3426
+ content: options.content,
3427
+ chunks: [],
3428
+ metadata: {
3429
+ totalTokensUsed: 0,
3430
+ totalDuration: 0,
3431
+ averageQuality: 100,
3432
+ provider: this.provider.name,
3433
+ model: this.config.provider.model ?? this.provider.defaultModel,
3434
+ totalIterations: 0,
3435
+ tokensUsed: { input: 0, output: 0 },
3436
+ cache: { hits: 0, misses: 0 }
3437
+ }
3438
+ };
3439
+ }
3440
+ const chunks = chunkHTMLSections(document.sections, {
3441
+ maxTokens: this.config.chunking.maxTokens
3442
+ });
3443
+ if (this.verbose) {
3444
+ logger.info(`Chunked into ${chunks.length} translation units`);
3445
+ }
3446
+ const agent = createTranslationAgent({
3447
+ provider: this.provider,
3448
+ qualityThreshold: options.qualityThreshold ?? this.config.quality.threshold,
3449
+ maxIterations: options.maxIterations ?? this.config.quality.maxIterations,
3450
+ verbose: this.verbose,
3451
+ strictQuality: options.strictQuality
3452
+ });
3453
+ const allTranslations = {};
3454
+ const chunkResults = [];
3455
+ let totalInputTokens = 0;
3456
+ let totalOutputTokens = 0;
3457
+ let totalIterations = 0;
3458
+ for (let i = 0; i < chunks.length; i++) {
3459
+ const chunk = chunks[i];
3460
+ if (!chunk) continue;
3461
+ if (this.verbose) {
3462
+ logger.info(`Translating HTML chunk ${i + 1}/${chunks.length} (${chunk.sections.length} sections)...`);
3463
+ }
3464
+ const glossaryString = glossary ? JSON.stringify(glossary.terms.map((t) => ({ s: t.source, t: t.target }))) : void 0;
3465
+ const cacheKey = {
3466
+ content: chunk.content,
3467
+ sourceLang: options.sourceLang,
3468
+ targetLang: options.targetLang,
3469
+ glossary: glossaryString,
3470
+ provider: this.provider.name,
3471
+ model: this.config.provider.model ?? this.provider.defaultModel
3472
+ };
3473
+ const cacheResult = this.cache.get(cacheKey);
3474
+ if (cacheResult.hit && cacheResult.entry) {
3475
+ this.cacheHits++;
3476
+ if (this.verbose) {
3477
+ logger.info(` \u21B3 Cache hit (quality: ${cacheResult.entry.qualityScore})`);
3478
+ }
3479
+ const chunkTranslations = parseTranslatedChunk(chunk, cacheResult.entry.translation);
3480
+ Object.assign(allTranslations, chunkTranslations);
3481
+ chunkResults.push({
3482
+ original: chunk.content,
3483
+ translated: cacheResult.entry.translation,
3484
+ startOffset: 0,
3485
+ endOffset: chunk.content.length,
3486
+ qualityScore: cacheResult.entry.qualityScore,
3487
+ iterations: 0,
3488
+ tokensUsed: { input: 0, output: 0, cacheRead: 1 },
3489
+ cached: true
3490
+ });
3491
+ continue;
3492
+ }
3493
+ this.cacheMisses++;
3494
+ const resolvedStyleInstruction = options.styleInstruction ?? this.config.languages.styles?.[options.targetLang];
3495
+ const request = {
3496
+ content: chunk.content,
3497
+ sourceLang: options.sourceLang,
3498
+ targetLang: options.targetLang,
3499
+ format: "html",
3500
+ glossary,
3501
+ context: {
3502
+ documentPurpose: options.context,
3503
+ styleInstruction: resolvedStyleInstruction,
3504
+ documentSummary: "HTML document with structured sections. Preserve the [section-N] markers exactly as they appear. Translate only the text after each marker."
3505
+ }
3506
+ };
3507
+ try {
3508
+ const result = await agent.translate(request);
3509
+ const chunkTranslations = parseTranslatedChunk(chunk, result.content);
3510
+ Object.assign(allTranslations, chunkTranslations);
3511
+ this.cache.set(cacheKey, result.content, result.metadata.qualityScore);
3512
+ chunkResults.push({
3513
+ original: chunk.content,
3514
+ translated: result.content,
3515
+ startOffset: 0,
3516
+ endOffset: chunk.content.length,
3517
+ qualityScore: result.metadata.qualityScore,
3518
+ iterations: result.metadata.iterations,
3519
+ tokensUsed: result.metadata.tokensUsed
3520
+ });
3521
+ if (result.metadata.tokensUsed) {
3522
+ totalInputTokens += result.metadata.tokensUsed.input;
3523
+ totalOutputTokens += result.metadata.tokensUsed.output;
3524
+ }
3525
+ totalIterations += result.metadata.iterations;
3526
+ } catch (error) {
3527
+ logger.error(`Failed to translate HTML chunk ${i + 1}: ${error}`);
3528
+ for (const section of chunk.sections) {
3529
+ allTranslations[section.id] = section.content;
3530
+ }
3531
+ chunkResults.push({
3532
+ original: chunk.content,
3533
+ translated: chunk.content,
3534
+ startOffset: 0,
3535
+ endOffset: chunk.content.length,
3536
+ qualityScore: 0,
3537
+ iterations: 0,
3538
+ tokensUsed: { input: 0, output: 0 }
3539
+ });
3540
+ }
3541
+ }
3542
+ const finalContent = applyHTMLTranslations(document, allTranslations);
3543
+ const qualityScores = chunkResults.filter((r) => r.qualityScore > 0).map((r) => r.qualityScore);
3544
+ const averageQuality = qualityScores.length > 0 ? qualityScores.reduce((a, b) => a + b, 0) / qualityScores.length : 0;
3545
+ const cacheHits = chunkResults.filter((r) => r.cached).length;
3546
+ const cacheMisses = chunkResults.filter((r) => !r.cached && r.qualityScore > 0).length;
3547
+ return {
3548
+ content: finalContent,
3549
+ chunks: chunkResults,
3550
+ metadata: {
3551
+ totalTokensUsed: totalInputTokens + totalOutputTokens,
3552
+ totalDuration: 0,
3553
+ // Will be set by caller
3554
+ averageQuality,
3555
+ provider: this.provider.name,
3556
+ model: this.config.provider.model ?? this.provider.defaultModel,
3557
+ totalIterations,
3558
+ tokensUsed: {
3559
+ input: totalInputTokens,
3560
+ output: totalOutputTokens
3561
+ },
3562
+ cache: {
3563
+ hits: cacheHits,
3564
+ misses: cacheMisses
3565
+ }
3566
+ }
3567
+ };
3568
+ }
3179
3569
  async translatePlainText(options, glossary) {
3180
3570
  const chunks = chunkContent(options.content, {
3181
3571
  maxTokens: this.config.chunking.maxTokens,
@@ -4368,8 +4758,9 @@ function createApp(options) {
4368
4758
  json: options.jsonLogging ?? false
4369
4759
  }));
4370
4760
  if (options.enableCors) {
4761
+ const corsOrigin = options.corsOrigins ?? "*";
4371
4762
  app.use("*", cors({
4372
- origin: "*",
4763
+ origin: corsOrigin,
4373
4764
  allowMethods: ["GET", "POST", "OPTIONS"],
4374
4765
  allowHeaders: ["Content-Type", "Authorization", "X-API-Key"],
4375
4766
  exposeHeaders: ["X-Request-Id"],
@@ -4429,7 +4820,12 @@ llm-translate server started`);
4429
4820
  console.log(` - Health: http://${options.host}:${options.port}/health`);
4430
4821
  console.log(` - Translate: http://${options.host}:${options.port}/translate`);
4431
4822
  console.log(` - Auth: ${options.enableAuth ? "enabled" : "disabled"}`);
4432
- console.log(` - CORS: ${options.enableCors ? "enabled" : "disabled"}`);
4823
+ if (options.enableCors) {
4824
+ const corsInfo = options.corsOrigins ? Array.isArray(options.corsOrigins) ? options.corsOrigins.join(", ") : options.corsOrigins : "all origins";
4825
+ console.log(` - CORS: enabled (${corsInfo})`);
4826
+ } else {
4827
+ console.log(` - CORS: disabled`);
4828
+ }
4433
4829
  console.log(` - Cache: ${options.cachePath ?? "disabled"}`);
4434
4830
  console.log("");
4435
4831
  const shutdown = (signal) => {
@@ -4458,7 +4854,7 @@ var serveCommand = new Command("serve").description("Start the translation API s
4458
4854
  "-p, --port <number>",
4459
4855
  "Server port (env: TRANSLATE_PORT)",
4460
4856
  process.env["TRANSLATE_PORT"] ?? "3000"
4461
- ).option("-H, --host <string>", "Host to bind", "0.0.0.0").option("--no-auth", "Disable API key authentication").option("--cors", "Enable CORS for browser clients").option("--json", "Use JSON logging format (for containers)").option(
4857
+ ).option("-H, --host <string>", "Host to bind", "0.0.0.0").option("--no-auth", "Disable API key authentication").option("--cors [origins]", "Enable CORS (optionally specify allowed origins, comma-separated)").option("--json", "Use JSON logging format (for containers)").option(
4462
4858
  "--cache-dir <path>",
4463
4859
  "Cache directory path (env: TRANSLATE_CACHE_DIR)",
4464
4860
  process.env["TRANSLATE_CACHE_DIR"]
@@ -4478,11 +4874,17 @@ var serveCommand = new Command("serve").description("Start the translation API s
4478
4874
  "Set TRANSLATE_API_KEY environment variable to enable authentication.\n"
4479
4875
  );
4480
4876
  }
4877
+ const enableCors = options.cors !== void 0 && options.cors !== false;
4878
+ let corsOrigins;
4879
+ if (typeof options.cors === "string") {
4880
+ corsOrigins = options.cors.includes(",") ? options.cors.split(",").map((o) => o.trim()) : options.cors;
4881
+ }
4481
4882
  startServer({
4482
4883
  port,
4483
4884
  host,
4484
4885
  enableAuth,
4485
- enableCors: options.cors ?? false,
4886
+ enableCors,
4887
+ corsOrigins,
4486
4888
  apiKey: process.env["TRANSLATE_API_KEY"],
4487
4889
  jsonLogging: options.json ?? false,
4488
4890
  cachePath: options.cacheDir