@aigne/doc-smith 0.9.8 → 0.9.9-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/package.json +1 -1
- package/utils/file-utils.mjs +40 -5
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.9.9-beta](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8...v0.9.9-beta) (2025-12-09)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Bug Fixes
|
|
7
|
+
|
|
8
|
+
* implement robust error handling for token calculation ([#352](https://github.com/AIGNE-io/aigne-doc-smith/issues/352)) ([e7ba726](https://github.com/AIGNE-io/aigne-doc-smith/commit/e7ba726e226c05a1ac2b40c74b101e1a2d972091))
|
|
9
|
+
|
|
3
10
|
## [0.9.8](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8-beta.1...v0.9.8) (2025-12-07)
|
|
4
11
|
|
|
5
12
|
## [0.9.8-beta.1](https://github.com/AIGNE-io/aigne-doc-smith/compare/v0.9.8-beta...v0.9.8-beta.1) (2025-12-06)
|
package/package.json
CHANGED
package/utils/file-utils.mjs
CHANGED
|
@@ -530,9 +530,38 @@ export async function readFileContents(files, baseDir = process.cwd(), options =
|
|
|
530
530
|
return results.filter((result) => result !== null);
|
|
531
531
|
}
|
|
532
532
|
|
|
533
|
+
/**
|
|
534
|
+
* Sanitize text by removing or escaping disallowed LLM special tokens
|
|
535
|
+
* This prevents errors when encoding text that contains special tokens like <|endoftext|>
|
|
536
|
+
* @param {string} text - Text to sanitize
|
|
537
|
+
* @returns {string} Sanitized text safe for tokenization
|
|
538
|
+
*/
|
|
539
|
+
function sanitizeForTokenization(text) {
|
|
540
|
+
if (typeof text !== "string") return text;
|
|
541
|
+
// Replace <|endoftext|> with a safe alternative that won't trigger special token parsing
|
|
542
|
+
// We replace it with a space-separated version to prevent tokenizer from recognizing it as a special token
|
|
543
|
+
return text.replace(/<\|endoftext\|>/g, "<| endoftext |>");
|
|
544
|
+
}
|
|
545
|
+
|
|
533
546
|
export function calculateTokens(text) {
|
|
534
|
-
|
|
535
|
-
|
|
547
|
+
try {
|
|
548
|
+
// Sanitize text before encoding to avoid errors with special tokens
|
|
549
|
+
const sanitizedText = sanitizeForTokenization(text);
|
|
550
|
+
const tokens = encode(sanitizedText);
|
|
551
|
+
return tokens.length;
|
|
552
|
+
} catch (error) {
|
|
553
|
+
// If encoding still fails, try with more aggressive sanitization
|
|
554
|
+
console.warn(`Token calculation warning: ${error.message}`);
|
|
555
|
+
const fallbackText = sanitizeForTokenization(text).replace(/<\|[^|]+\|>/g, "");
|
|
556
|
+
try {
|
|
557
|
+
const tokens = encode(fallbackText);
|
|
558
|
+
return tokens.length;
|
|
559
|
+
} catch {
|
|
560
|
+
// Last resort: estimate tokens based on character count (rough approximation)
|
|
561
|
+
console.warn(`Token calculation fallback: using character-based estimation`);
|
|
562
|
+
return Math.ceil(fallbackText.length / 4); // Rough estimate: ~4 chars per token
|
|
563
|
+
}
|
|
564
|
+
}
|
|
536
565
|
}
|
|
537
566
|
|
|
538
567
|
/**
|
|
@@ -547,9 +576,15 @@ export function calculateFileStats(sourceFiles) {
|
|
|
547
576
|
for (const source of sourceFiles) {
|
|
548
577
|
const { content } = source;
|
|
549
578
|
if (content) {
|
|
550
|
-
// Count tokens using gpt-tokenizer
|
|
551
|
-
|
|
552
|
-
|
|
579
|
+
// Count tokens using gpt-tokenizer with sanitization
|
|
580
|
+
try {
|
|
581
|
+
const sanitizedContent = sanitizeForTokenization(content);
|
|
582
|
+
const tokens = encode(sanitizedContent);
|
|
583
|
+
totalTokens += tokens.length;
|
|
584
|
+
} catch {
|
|
585
|
+
// Fallback: use calculateTokens which has its own error handling
|
|
586
|
+
totalTokens += calculateTokens(content);
|
|
587
|
+
}
|
|
553
588
|
|
|
554
589
|
// Count lines (excluding empty lines)
|
|
555
590
|
totalLines += content.split("\n").filter((line) => line.trim() !== "").length;
|