@aigne/doc-smith 0.9.9-beta.4 → 0.9.10-beta
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.9.10-beta](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9...v0.9.10-beta) (2025-12-18)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Bug Fixes
|
|
7
|
+
|
|
8
|
+
* more robust translation processing and token handling ([#366](https://github.com/AIGNE-io/aigne-doc-smith/issues/366)) ([0a634ef](https://github.com/AIGNE-io/aigne-doc-smith/commit/0a634ef46b4da1204368af63fddff126c3d4f45d))
|
|
9
|
+
|
|
10
|
+
## [0.9.9](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.4...v0.9.9) (2025-12-13)
|
|
11
|
+
|
|
3
12
|
## [0.9.9-beta.4](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.9-beta.3...v0.9.9-beta.4) (2025-12-12)
|
|
4
13
|
|
|
5
14
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { readdirSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
2
3
|
import { findItemByPath, readFileContent } from "../../utils/docs-finder-utils.mjs";
|
|
4
|
+
import { pathExists } from "../../utils/file-utils.mjs";
|
|
3
5
|
|
|
4
6
|
/**
|
|
5
7
|
* Loads a document's content along with all its translations from the docs directory.
|
|
@@ -49,7 +51,17 @@ export default async function loadDocumentAllContent({ path, docsDir, documentSt
|
|
|
49
51
|
);
|
|
50
52
|
|
|
51
53
|
// Process each translation file
|
|
54
|
+
// Note: translationFiles are already filtered by readdirSync, but we check existence again for safety
|
|
52
55
|
for (const file of translationFiles) {
|
|
56
|
+
const filePath = join(docsDir, file);
|
|
57
|
+
|
|
58
|
+
// Check if file exists before reading to avoid unnecessary warnings
|
|
59
|
+
// (though readdirSync should already filter existing files)
|
|
60
|
+
const fileExists = await pathExists(filePath);
|
|
61
|
+
if (!fileExists) {
|
|
62
|
+
continue;
|
|
63
|
+
}
|
|
64
|
+
|
|
53
65
|
const content = await readFileContent(docsDir, file);
|
|
54
66
|
if (content) {
|
|
55
67
|
// Extract language code from filename (e.g., "en" from "doc.en.md" or "zh-CN" from "doc.zh-CN.md")
|
package/package.json
CHANGED
package/utils/file-utils.mjs
CHANGED
|
@@ -530,38 +530,9 @@ export async function readFileContents(files, baseDir = process.cwd(), options =
|
|
|
530
530
|
return results.filter((result) => result !== null);
|
|
531
531
|
}
|
|
532
532
|
|
|
533
|
-
/**
|
|
534
|
-
* Sanitize text by removing or escaping disallowed LLM special tokens
|
|
535
|
-
* This prevents errors when encoding text that contains special tokens like <|endoftext|>
|
|
536
|
-
* @param {string} text - Text to sanitize
|
|
537
|
-
* @returns {string} Sanitized text safe for tokenization
|
|
538
|
-
*/
|
|
539
|
-
function sanitizeForTokenization(text) {
|
|
540
|
-
if (typeof text !== "string") return text;
|
|
541
|
-
// Replace <|endoftext|> with a safe alternative that won't trigger special token parsing
|
|
542
|
-
// We replace it with a space-separated version to prevent tokenizer from recognizing it as a special token
|
|
543
|
-
return text.replace(/<\|endoftext\|>/g, "<| endoftext |>");
|
|
544
|
-
}
|
|
545
|
-
|
|
546
533
|
export function calculateTokens(text) {
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
const sanitizedText = sanitizeForTokenization(text);
|
|
550
|
-
const tokens = encode(sanitizedText);
|
|
551
|
-
return tokens.length;
|
|
552
|
-
} catch (error) {
|
|
553
|
-
// If encoding still fails, try with more aggressive sanitization
|
|
554
|
-
console.warn(`Token calculation warning: ${error.message}`);
|
|
555
|
-
const fallbackText = sanitizeForTokenization(text).replace(/<\|[^|]+\|>/g, "");
|
|
556
|
-
try {
|
|
557
|
-
const tokens = encode(fallbackText);
|
|
558
|
-
return tokens.length;
|
|
559
|
-
} catch {
|
|
560
|
-
// Last resort: estimate tokens based on character count (rough approximation)
|
|
561
|
-
console.warn(`Token calculation fallback: using character-based estimation`);
|
|
562
|
-
return Math.ceil(fallbackText.length / 4); // Rough estimate: ~4 chars per token
|
|
563
|
-
}
|
|
564
|
-
}
|
|
534
|
+
const tokens = encode(text, { allowedSpecial: "all" });
|
|
535
|
+
return tokens.length;
|
|
565
536
|
}
|
|
566
537
|
|
|
567
538
|
/**
|
|
@@ -576,15 +547,9 @@ export function calculateFileStats(sourceFiles) {
|
|
|
576
547
|
for (const source of sourceFiles) {
|
|
577
548
|
const { content } = source;
|
|
578
549
|
if (content) {
|
|
579
|
-
// Count tokens using gpt-tokenizer
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
const tokens = encode(sanitizedContent);
|
|
583
|
-
totalTokens += tokens.length;
|
|
584
|
-
} catch {
|
|
585
|
-
// Fallback: use calculateTokens which has its own error handling
|
|
586
|
-
totalTokens += calculateTokens(content);
|
|
587
|
-
}
|
|
550
|
+
// Count tokens using gpt-tokenizer
|
|
551
|
+
const tokens = encode(content, { allowedSpecial: "all" });
|
|
552
|
+
totalTokens += tokens.length;
|
|
588
553
|
|
|
589
554
|
// Count lines (excluding empty lines)
|
|
590
555
|
totalLines += content.split("\n").filter((line) => line.trim() !== "").length;
|
|
@@ -4,6 +4,7 @@ import { debug } from "./debug.mjs";
|
|
|
4
4
|
import path from "node:path";
|
|
5
5
|
import fs from "fs-extra";
|
|
6
6
|
import { d2CodeBlockRegex, diagramImageWithPathRegex } from "./d2-utils.mjs";
|
|
7
|
+
import { pathExists } from "./file-utils.mjs";
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
10
|
* Find all translation files for a document
|
|
@@ -119,6 +120,15 @@ export async function syncDiagramToTranslations(
|
|
|
119
120
|
for (const { fileName } of translationFiles) {
|
|
120
121
|
try {
|
|
121
122
|
const translationFilePath = path.join(docsDir, fileName);
|
|
123
|
+
|
|
124
|
+
// Check if translation file exists before reading to avoid unnecessary warnings
|
|
125
|
+
const fileExists = await pathExists(translationFilePath);
|
|
126
|
+
if (!fileExists) {
|
|
127
|
+
debug(`ℹ️ Translation file does not exist yet: ${fileName} (skipping)`);
|
|
128
|
+
result.skipped++;
|
|
129
|
+
continue;
|
|
130
|
+
}
|
|
131
|
+
|
|
122
132
|
const translationContent = await readFileContent(docsDir, fileName);
|
|
123
133
|
|
|
124
134
|
// Check for null or undefined (file read failure), but allow empty string (valid content)
|
|
@@ -7,6 +7,7 @@ import { diagramImageFullRegex } from "./d2-utils.mjs";
|
|
|
7
7
|
import { calculateImageTimestamp } from "./diagram-version-utils.mjs";
|
|
8
8
|
import { getFileName } from "./utils.mjs";
|
|
9
9
|
import { compressImage } from "./image-compress.mjs";
|
|
10
|
+
import { pathExists } from "./file-utils.mjs";
|
|
10
11
|
|
|
11
12
|
// Constants
|
|
12
13
|
const DEFAULT_DIAGRAM_TYPE = "architecture";
|
|
@@ -321,6 +322,15 @@ export async function translateDiagramImages(
|
|
|
321
322
|
for (const { language, fileName } of translationFiles) {
|
|
322
323
|
try {
|
|
323
324
|
const translationFilePath = path.join(docsDir, fileName);
|
|
325
|
+
|
|
326
|
+
// Check if translation file exists before reading to avoid unnecessary warnings
|
|
327
|
+
const fileExists = await pathExists(translationFilePath);
|
|
328
|
+
if (!fileExists) {
|
|
329
|
+
debug(`ℹ️ Translation file does not exist yet: ${fileName} (skipping)`);
|
|
330
|
+
result.skipped++;
|
|
331
|
+
continue;
|
|
332
|
+
}
|
|
333
|
+
|
|
324
334
|
const translationContent = await readFileContent(docsDir, fileName);
|
|
325
335
|
|
|
326
336
|
if (translationContent === null || translationContent === undefined) {
|
|
@@ -715,7 +725,13 @@ export default async function translateDiagramImagesAgent(input, options) {
|
|
|
715
725
|
|
|
716
726
|
// Read current translation file content (if exists) to check timestamps
|
|
717
727
|
const translationFileName = getFileName(docPath, currentLanguage);
|
|
718
|
-
const translationContent = await readFileContent(docsDir, translationFileName);
|
|
728
|
+
const translationFilePath = path.join(docsDir, translationFileName);
|
|
729
|
+
|
|
730
|
+
// Check if translation file exists before reading to avoid unnecessary warnings
|
|
731
|
+
const translationFileExists = await pathExists(translationFilePath);
|
|
732
|
+
const translationContent = translationFileExists
|
|
733
|
+
? await readFileContent(docsDir, translationFileName)
|
|
734
|
+
: null;
|
|
719
735
|
|
|
720
736
|
// Cache diagram images for translation
|
|
721
737
|
// This function will:
|