npm - @aigne/doc-smith - Version diff - 0.9.8 → 0.9.9-beta - Mend

@aigne/doc-smith 0.9.8 → 0.9.9-beta

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,12 @@
 # Changelog
+## [0.9.9-beta](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8...v0.9.9-beta) (2025-12-09)
+### Bug Fixes
+* implement robust error handling for token calculation ([#352](https://github.com/AIGNE-io/aigne-doc-smith/issues/352)) ([e7ba726](https://github.com/AIGNE-io/aigne-doc-smith/commit/e7ba726e226c05a1ac2b40c74b101e1a2d972091))
 ## [0.9.8](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8-beta.1...v0.9.8) (2025-12-07)
 ## [0.9.8-beta.1](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8-beta...v0.9.8-beta.1) (2025-12-06)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aigne/doc-smith",
-  "version": "0.9.8",
+  "version": "0.9.9-beta",
   "description": "AI-driven documentation generation tool built on the AIGNE Framework",
   "publishConfig": {
     "access": "public"

package/utils/file-utils.mjs CHANGED Viewed

@@ -530,9 +530,38 @@ export async function readFileContents(files, baseDir = process.cwd(), options =
   return results.filter((result) => result !== null);
 }
+/**
+ * Sanitize text by removing or escaping disallowed LLM special tokens
+ * This prevents errors when encoding text that contains special tokens like <|endoftext|>
+ * @param {string} text - Text to sanitize
+ * @returns {string} Sanitized text safe for tokenization
+ */
+function sanitizeForTokenization(text) {
+  if (typeof text !== "string") return text;
+  // Replace <|endoftext|> with a safe alternative that won't trigger special token parsing
+  // We replace it with a space-separated version to prevent tokenizer from recognizing it as a special token
+  return text.replace(/<\|endoftext\|>/g, "<| endoftext |>");
+}
 export function calculateTokens(text) {
-  const tokens = encode(text);
-  return tokens.length;
+  try {
+    // Sanitize text before encoding to avoid errors with special tokens
+    const sanitizedText = sanitizeForTokenization(text);
+    const tokens = encode(sanitizedText);
+    return tokens.length;
+  } catch (error) {
+    // If encoding still fails, try with more aggressive sanitization
+    console.warn(`Token calculation warning: ${error.message}`);
+    const fallbackText = sanitizeForTokenization(text).replace(/<\|[^|]+\|>/g, "");
+    try {
+      const tokens = encode(fallbackText);
+      return tokens.length;
+    } catch {
+      // Last resort: estimate tokens based on character count (rough approximation)
+      console.warn(`Token calculation fallback: using character-based estimation`);
+      return Math.ceil(fallbackText.length / 4); // Rough estimate: ~4 chars per token
+    }
+  }
 }
 /**
@@ -547,9 +576,15 @@ export function calculateFileStats(sourceFiles) {
   for (const source of sourceFiles) {
     const { content } = source;
     if (content) {
-      // Count tokens using gpt-tokenizer
-      const tokens = encode(content);
-      totalTokens += tokens.length;
+      // Count tokens using gpt-tokenizer with sanitization
+      try {
+        const sanitizedContent = sanitizeForTokenization(content);
+        const tokens = encode(sanitizedContent);
+        totalTokens += tokens.length;
+      } catch {
+        // Fallback: use calculateTokens which has its own error handling
+        totalTokens += calculateTokens(content);
+      }
       // Count lines (excluding empty lines)
       totalLines += content.split("\n").filter((line) => line.trim() !== "").length;