@aigne/doc-smith 0.9.9 → 0.9.10-beta

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.9.10-beta](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9...v0.9.10-beta) (2025-12-18)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * more robust translation processing and token handling ([#366](https://github.com/AIGNE-io/aigne-doc-smith/issues/366)) ([0a634ef](https://github.com/AIGNE-io/aigne-doc-smith/commit/0a634ef46b4da1204368af63fddff126c3d4f45d))
9
+
3
10
  ## [0.9.9](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.4...v0.9.9) (2025-12-13)
4
11
 
5
12
  ## [0.9.9-beta.4](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.3...v0.9.9-beta.4) (2025-12-12)
@@ -1,5 +1,7 @@
1
1
  import { readdirSync } from "node:fs";
2
+ import { join } from "node:path";
2
3
  import { findItemByPath, readFileContent } from "../../utils/docs-finder-utils.mjs";
4
+ import { pathExists } from "../../utils/file-utils.mjs";
3
5
 
4
6
  /**
5
7
  * Loads a document's content along with all its translations from the docs directory.
@@ -49,7 +51,17 @@ export default async function loadDocumentAllContent({ path, docsDir, documentSt
49
51
  );
50
52
 
51
53
  // Process each translation file
54
+ // Note: translationFiles are already filtered by readdirSync, but we check existence again for safety
52
55
  for (const file of translationFiles) {
56
+ const filePath = join(docsDir, file);
57
+
58
+ // Check if file exists before reading to avoid unnecessary warnings
59
+ // (though readdirSync should already filter existing files)
60
+ const fileExists = await pathExists(filePath);
61
+ if (!fileExists) {
62
+ continue;
63
+ }
64
+
53
65
  const content = await readFileContent(docsDir, file);
54
66
  if (content) {
55
67
  // Extract language code from filename (e.g., "en" from "doc.en.md" or "zh-CN" from "doc.zh-CN.md")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aigne/doc-smith",
3
- "version": "0.9.9",
3
+ "version": "0.9.10-beta",
4
4
  "description": "AI-driven documentation generation tool built on the AIGNE Framework",
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -530,38 +530,9 @@ export async function readFileContents(files, baseDir = process.cwd(), options =
530
530
  return results.filter((result) => result !== null);
531
531
  }
532
532
 
533
- /**
534
- * Sanitize text by removing or escaping disallowed LLM special tokens
535
- * This prevents errors when encoding text that contains special tokens like <|endoftext|>
536
- * @param {string} text - Text to sanitize
537
- * @returns {string} Sanitized text safe for tokenization
538
- */
539
- function sanitizeForTokenization(text) {
540
- if (typeof text !== "string") return text;
541
- // Replace <|endoftext|> with a safe alternative that won't trigger special token parsing
542
- // We replace it with a space-separated version to prevent tokenizer from recognizing it as a special token
543
- return text.replace(/<\|endoftext\|>/g, "<| endoftext |>");
544
- }
545
-
546
533
  export function calculateTokens(text) {
547
- try {
548
- // Sanitize text before encoding to avoid errors with special tokens
549
- const sanitizedText = sanitizeForTokenization(text);
550
- const tokens = encode(sanitizedText);
551
- return tokens.length;
552
- } catch (error) {
553
- // If encoding still fails, try with more aggressive sanitization
554
- console.warn(`Token calculation warning: ${error.message}`);
555
- const fallbackText = sanitizeForTokenization(text).replace(/<\|[^|]+\|>/g, "");
556
- try {
557
- const tokens = encode(fallbackText);
558
- return tokens.length;
559
- } catch {
560
- // Last resort: estimate tokens based on character count (rough approximation)
561
- console.warn(`Token calculation fallback: using character-based estimation`);
562
- return Math.ceil(fallbackText.length / 4); // Rough estimate: ~4 chars per token
563
- }
564
- }
534
+ const tokens = encode(text, { allowedSpecial: "all" });
535
+ return tokens.length;
565
536
  }
566
537
 
567
538
  /**
@@ -576,15 +547,9 @@ export function calculateFileStats(sourceFiles) {
576
547
  for (const source of sourceFiles) {
577
548
  const { content } = source;
578
549
  if (content) {
579
- // Count tokens using gpt-tokenizer with sanitization
580
- try {
581
- const sanitizedContent = sanitizeForTokenization(content);
582
- const tokens = encode(sanitizedContent);
583
- totalTokens += tokens.length;
584
- } catch {
585
- // Fallback: use calculateTokens which has its own error handling
586
- totalTokens += calculateTokens(content);
587
- }
550
+ // Count tokens using gpt-tokenizer
551
+ const tokens = encode(content, { allowedSpecial: "all" });
552
+ totalTokens += tokens.length;
588
553
 
589
554
  // Count lines (excluding empty lines)
590
555
  totalLines += content.split("\n").filter((line) => line.trim() !== "").length;
@@ -4,6 +4,7 @@ import { debug } from "./debug.mjs";
4
4
  import path from "node:path";
5
5
  import fs from "fs-extra";
6
6
  import { d2CodeBlockRegex, diagramImageWithPathRegex } from "./d2-utils.mjs";
7
+ import { pathExists } from "./file-utils.mjs";
7
8
 
8
9
  /**
9
10
  * Find all translation files for a document
@@ -119,6 +120,15 @@ export async function syncDiagramToTranslations(
119
120
  for (const { fileName } of translationFiles) {
120
121
  try {
121
122
  const translationFilePath = path.join(docsDir, fileName);
123
+
124
+ // Check if translation file exists before reading to avoid unnecessary warnings
125
+ const fileExists = await pathExists(translationFilePath);
126
+ if (!fileExists) {
127
+ debug(`ℹ️ Translation file does not exist yet: ${fileName} (skipping)`);
128
+ result.skipped++;
129
+ continue;
130
+ }
131
+
122
132
  const translationContent = await readFileContent(docsDir, fileName);
123
133
 
124
134
  // Check for null or undefined (file read failure), but allow empty string (valid content)
@@ -7,6 +7,7 @@ import { diagramImageFullRegex } from "./d2-utils.mjs";
7
7
  import { calculateImageTimestamp } from "./diagram-version-utils.mjs";
8
8
  import { getFileName } from "./utils.mjs";
9
9
  import { compressImage } from "./image-compress.mjs";
10
+ import { pathExists } from "./file-utils.mjs";
10
11
 
11
12
  // Constants
12
13
  const DEFAULT_DIAGRAM_TYPE = "architecture";
@@ -321,6 +322,15 @@ export async function translateDiagramImages(
321
322
  for (const { language, fileName } of translationFiles) {
322
323
  try {
323
324
  const translationFilePath = path.join(docsDir, fileName);
325
+
326
+ // Check if translation file exists before reading to avoid unnecessary warnings
327
+ const fileExists = await pathExists(translationFilePath);
328
+ if (!fileExists) {
329
+ debug(`ℹ️ Translation file does not exist yet: ${fileName} (skipping)`);
330
+ result.skipped++;
331
+ continue;
332
+ }
333
+
324
334
  const translationContent = await readFileContent(docsDir, fileName);
325
335
 
326
336
  if (translationContent === null || translationContent === undefined) {
@@ -715,7 +725,13 @@ export default async function translateDiagramImagesAgent(input, options) {
715
725
 
716
726
  // Read current translation file content (if exists) to check timestamps
717
727
  const translationFileName = getFileName(docPath, currentLanguage);
718
- const translationContent = await readFileContent(docsDir, translationFileName);
728
+ const translationFilePath = path.join(docsDir, translationFileName);
729
+
730
+ // Check if translation file exists before reading to avoid unnecessary warnings
731
+ const translationFileExists = await pathExists(translationFilePath);
732
+ const translationContent = translationFileExists
733
+ ? await readFileContent(docsDir, translationFileName)
734
+ : null;
719
735
 
720
736
  // Cache diagram images for translation
721
737
  // This function will: