npm - @aigne/doc-smith - Versions diffs - 0.9.9 → 0.9.10-beta - Mend

@aigne/doc-smith 0.9.9 → 0.9.10-beta

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

package/CHANGELOG.md +7 -0
package/agents/utils/load-document-all-content.mjs +12 -0
package/package.json +1 -1
package/utils/file-utils.mjs +5 -40
package/utils/sync-diagram-to-translations.mjs +10 -0
package/utils/translate-diagram-images.mjs +17 -1

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,12 @@
 # Changelog
+## [0.9.10-beta](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9...v0.9.10-beta) (2025-12-18)
+### Bug Fixes
+* more robust translation processing and token handling ([#366](https://github.com/AIGNE-io/aigne-doc-smith/issues/366)) ([0a634ef](https://github.com/AIGNE-io/aigne-doc-smith/commit/0a634ef46b4da1204368af63fddff126c3d4f45d))
 ## [0.9.9](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.4...v0.9.9) (2025-12-13)
 ## [0.9.9-beta.4](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.3...v0.9.9-beta.4) (2025-12-12)

package/agents/utils/load-document-all-content.mjs CHANGED Viewed

@@ -1,5 +1,7 @@
 import { readdirSync } from "node:fs";
+import { join } from "node:path";
 import { findItemByPath, readFileContent } from "../../utils/docs-finder-utils.mjs";
+import { pathExists } from "../../utils/file-utils.mjs";
 /**
  * Loads a document's content along with all its translations from the docs directory.
@@ -49,7 +51,17 @@ export default async function loadDocumentAllContent({ path, docsDir, documentSt
     );
     // Process each translation file
+    // Note: translationFiles are already filtered by readdirSync, but we check existence again for safety
     for (const file of translationFiles) {
+      const filePath = join(docsDir, file);
+      // Check if file exists before reading to avoid unnecessary warnings
+      // (though readdirSync should already filter existing files)
+      const fileExists = await pathExists(filePath);
+      if (!fileExists) {
+        continue;
+      }
       const content = await readFileContent(docsDir, file);
       if (content) {
         // Extract language code from filename (e.g., "en" from "doc.en.md" or "zh-CN" from "doc.zh-CN.md")

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aigne/doc-smith",
-  "version": "0.9.9",
+  "version": "0.9.10-beta",
   "description": "AI-driven documentation generation tool built on the AIGNE Framework",
   "publishConfig": {
     "access": "public"

package/utils/file-utils.mjs CHANGED Viewed

@@ -530,38 +530,9 @@ export async function readFileContents(files, baseDir = process.cwd(), options =
   return results.filter((result) => result !== null);
 }
-/**
- * Sanitize text by removing or escaping disallowed LLM special tokens
- * This prevents errors when encoding text that contains special tokens like <|endoftext|>
- * @param {string} text - Text to sanitize
- * @returns {string} Sanitized text safe for tokenization
- */
-function sanitizeForTokenization(text) {
-  if (typeof text !== "string") return text;
-  // Replace <|endoftext|> with a safe alternative that won't trigger special token parsing
-  // We replace it with a space-separated version to prevent tokenizer from recognizing it as a special token
-  return text.replace(/<\|endoftext\|>/g, "<| endoftext |>");
-}
 export function calculateTokens(text) {
-  try {
-    // Sanitize text before encoding to avoid errors with special tokens
-    const sanitizedText = sanitizeForTokenization(text);
-    const tokens = encode(sanitizedText);
-    return tokens.length;
-  } catch (error) {
-    // If encoding still fails, try with more aggressive sanitization
-    console.warn(`Token calculation warning: ${error.message}`);
-    const fallbackText = sanitizeForTokenization(text).replace(/<\|[^|]+\|>/g, "");
-    try {
-      const tokens = encode(fallbackText);
-      return tokens.length;
-    } catch {
-      // Last resort: estimate tokens based on character count (rough approximation)
-      console.warn(`Token calculation fallback: using character-based estimation`);
-      return Math.ceil(fallbackText.length / 4); // Rough estimate: ~4 chars per token
-    }
-  }
+  const tokens = encode(text, { allowedSpecial: "all" });
+  return tokens.length;
 }
 /**
@@ -576,15 +547,9 @@ export function calculateFileStats(sourceFiles) {
   for (const source of sourceFiles) {
     const { content } = source;
     if (content) {
-      // Count tokens using gpt-tokenizer with sanitization
-      try {
-        const sanitizedContent = sanitizeForTokenization(content);
-        const tokens = encode(sanitizedContent);
-        totalTokens += tokens.length;
-      } catch {
-        // Fallback: use calculateTokens which has its own error handling
-        totalTokens += calculateTokens(content);
-      }
+      // Count tokens using gpt-tokenizer
+      const tokens = encode(content, { allowedSpecial: "all" });
+      totalTokens += tokens.length;
       // Count lines (excluding empty lines)
       totalLines += content.split("\n").filter((line) => line.trim() !== "").length;

package/utils/sync-diagram-to-translations.mjs CHANGED Viewed

@@ -4,6 +4,7 @@ import { debug } from "./debug.mjs";
 import path from "node:path";
 import fs from "fs-extra";
 import { d2CodeBlockRegex, diagramImageWithPathRegex } from "./d2-utils.mjs";
+import { pathExists } from "./file-utils.mjs";
 /**
  * Find all translation files for a document
@@ -119,6 +120,15 @@ export async function syncDiagramToTranslations(
   for (const { fileName } of translationFiles) {
     try {
       const translationFilePath = path.join(docsDir, fileName);
+      // Check if translation file exists before reading to avoid unnecessary warnings
+      const fileExists = await pathExists(translationFilePath);
+      if (!fileExists) {
+        debug(`ℹ️  Translation file does not exist yet: ${fileName} (skipping)`);
+        result.skipped++;
+        continue;
+      }
       const translationContent = await readFileContent(docsDir, fileName);
       // Check for null or undefined (file read failure), but allow empty string (valid content)

package/utils/translate-diagram-images.mjs CHANGED Viewed

@@ -7,6 +7,7 @@ import { diagramImageFullRegex } from "./d2-utils.mjs";
 import { calculateImageTimestamp } from "./diagram-version-utils.mjs";
 import { getFileName } from "./utils.mjs";
 import { compressImage } from "./image-compress.mjs";
+import { pathExists } from "./file-utils.mjs";
 // Constants
 const DEFAULT_DIAGRAM_TYPE = "architecture";
@@ -321,6 +322,15 @@ export async function translateDiagramImages(
     for (const { language, fileName } of translationFiles) {
       try {
         const translationFilePath = path.join(docsDir, fileName);
+        // Check if translation file exists before reading to avoid unnecessary warnings
+        const fileExists = await pathExists(translationFilePath);
+        if (!fileExists) {
+          debug(`ℹ️  Translation file does not exist yet: ${fileName} (skipping)`);
+          result.skipped++;
+          continue;
+        }
         const translationContent = await readFileContent(docsDir, fileName);
         if (translationContent === null || translationContent === undefined) {
@@ -715,7 +725,13 @@ export default async function translateDiagramImagesAgent(input, options) {
     // Read current translation file content (if exists) to check timestamps
     const translationFileName = getFileName(docPath, currentLanguage);
-    const translationContent = await readFileContent(docsDir, translationFileName);
+    const translationFilePath = path.join(docsDir, translationFileName);
+    // Check if translation file exists before reading to avoid unnecessary warnings
+    const translationFileExists = await pathExists(translationFilePath);
+    const translationContent = translationFileExists
+      ? await readFileContent(docsDir, translationFileName)
+      : null;
     // Cache diagram images for translation
     // This function will: