@vertana/core 0.1.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/dist/_virtual/rolldown_runtime.cjs +29 -0
- package/dist/accumulator.cjs +64 -0
- package/dist/accumulator.d.cts +51 -0
- package/dist/accumulator.d.ts +51 -0
- package/dist/accumulator.js +61 -0
- package/dist/chunking.cjs +76 -0
- package/dist/chunking.d.cts +124 -0
- package/dist/chunking.d.ts +124 -0
- package/dist/chunking.js +74 -0
- package/dist/context.cjs +51 -0
- package/dist/context.d.cts +148 -0
- package/dist/context.d.ts +148 -0
- package/dist/context.js +49 -0
- package/dist/evaluation.cjs +120 -0
- package/dist/evaluation.d.cts +111 -0
- package/dist/evaluation.d.ts +111 -0
- package/dist/evaluation.js +119 -0
- package/dist/glossary.cjs +0 -0
- package/dist/glossary.d.cts +25 -0
- package/dist/glossary.d.ts +25 -0
- package/dist/glossary.js +0 -0
- package/dist/html.cjs +253 -0
- package/dist/html.d.cts +41 -0
- package/dist/html.d.ts +41 -0
- package/dist/html.js +250 -0
- package/dist/index.cjs +39 -0
- package/dist/index.d.cts +17 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +16 -0
- package/dist/markdown.cjs +300 -0
- package/dist/markdown.d.cts +17 -0
- package/dist/markdown.d.ts +17 -0
- package/dist/markdown.js +300 -0
- package/dist/plaintext.cjs +70 -0
- package/dist/plaintext.d.cts +17 -0
- package/dist/plaintext.d.ts +17 -0
- package/dist/plaintext.js +70 -0
- package/dist/prompt.cjs +91 -0
- package/dist/prompt.d.cts +74 -0
- package/dist/prompt.d.ts +74 -0
- package/dist/prompt.js +86 -0
- package/dist/refine.cjs +243 -0
- package/dist/refine.d.cts +148 -0
- package/dist/refine.d.ts +148 -0
- package/dist/refine.js +241 -0
- package/dist/select.cjs +62 -0
- package/dist/select.d.cts +83 -0
- package/dist/select.d.ts +83 -0
- package/dist/select.js +61 -0
- package/dist/terms.cjs +60 -0
- package/dist/terms.d.cts +36 -0
- package/dist/terms.d.ts +36 -0
- package/dist/terms.js +59 -0
- package/dist/tokens.cjs +40 -0
- package/dist/tokens.d.cts +24 -0
- package/dist/tokens.d.ts +24 -0
- package/dist/tokens.js +38 -0
- package/dist/tools.cjs +35 -0
- package/dist/tools.d.cts +20 -0
- package/dist/tools.d.ts +20 -0
- package/dist/tools.js +34 -0
- package/dist/translate.cjs +200 -0
- package/dist/translate.d.cts +190 -0
- package/dist/translate.d.ts +190 -0
- package/dist/translate.js +199 -0
- package/dist/window.cjs +0 -0
- package/dist/window.d.cts +48 -0
- package/dist/window.d.ts +48 -0
- package/dist/window.js +0 -0
- package/package.json +215 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { Glossary } from "./glossary.js";
|
|
2
|
+
import { LanguageModel } from "ai";
|
|
3
|
+
|
|
4
|
+
//#region src/evaluation.d.ts
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Options for {@link TranslationEvaluator}.
|
|
8
|
+
*/
|
|
9
|
+
interface EvaluatorOptions {
|
|
10
|
+
/**
|
|
11
|
+
* An optional `AbortSignal` to cancel the evaluation.
|
|
12
|
+
*/
|
|
13
|
+
readonly signal?: AbortSignal;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Options for the {@link evaluate} function.
|
|
17
|
+
*/
|
|
18
|
+
interface EvaluateOptions {
|
|
19
|
+
/**
|
|
20
|
+
* The target language of the translation.
|
|
21
|
+
*/
|
|
22
|
+
readonly targetLanguage: Intl.Locale | string;
|
|
23
|
+
/**
|
|
24
|
+
* The source language of the original text.
|
|
25
|
+
*/
|
|
26
|
+
readonly sourceLanguage?: Intl.Locale | string;
|
|
27
|
+
/**
|
|
28
|
+
* A glossary of terms that should be used consistently.
|
|
29
|
+
*/
|
|
30
|
+
readonly glossary?: Glossary;
|
|
31
|
+
/**
|
|
32
|
+
* An optional `AbortSignal` to cancel the evaluation.
|
|
33
|
+
*/
|
|
34
|
+
readonly signal?: AbortSignal;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* Evaluates the quality of a translation.
|
|
38
|
+
*
|
|
39
|
+
* @param original The original text that was translated.
|
|
40
|
+
* @param translated The translated text to evaluate.
|
|
41
|
+
* @param options Optional settings for the evaluation.
|
|
42
|
+
* @returns A promise that resolves to the evaluation result.
|
|
43
|
+
*/
|
|
44
|
+
type TranslationEvaluator = (original: string, translated: string, options?: EvaluatorOptions) => Promise<EvaluationResult>;
|
|
45
|
+
/**
|
|
46
|
+
* The result of evaluating a translation.
|
|
47
|
+
*/
|
|
48
|
+
interface EvaluationResult {
|
|
49
|
+
/**
|
|
50
|
+
* A quality score between 0 and 1, where 1 indicates a perfect translation
|
|
51
|
+
* and 0 indicates a completely incorrect translation.
|
|
52
|
+
*/
|
|
53
|
+
readonly score: number;
|
|
54
|
+
/**
|
|
55
|
+
* Specific issues found in the translation.
|
|
56
|
+
*/
|
|
57
|
+
readonly issues: readonly TranslationIssue[];
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* The type of issue found in a translation.
|
|
61
|
+
*
|
|
62
|
+
* - `"accuracy"`: The translation does not accurately convey the meaning
|
|
63
|
+
* of the original text.
|
|
64
|
+
* - `"fluency"`: The translation is not natural or readable in the target
|
|
65
|
+
* language.
|
|
66
|
+
* - `"terminology"`: Incorrect or inconsistent use of domain-specific terms.
|
|
67
|
+
* - `"style"`: The translation does not match the desired tone or style.
|
|
68
|
+
*/
|
|
69
|
+
type TranslationIssueType = "accuracy" | "fluency" | "terminology" | "style";
|
|
70
|
+
/**
|
|
71
|
+
* A specific issue found in a translation.
|
|
72
|
+
*/
|
|
73
|
+
interface TranslationIssue {
|
|
74
|
+
/**
|
|
75
|
+
* The type of issue.
|
|
76
|
+
*/
|
|
77
|
+
readonly type: TranslationIssueType;
|
|
78
|
+
/**
|
|
79
|
+
* A human-readable description of the issue.
|
|
80
|
+
*/
|
|
81
|
+
readonly description: string;
|
|
82
|
+
/**
|
|
83
|
+
* The location of the issue in the translated text, if applicable.
|
|
84
|
+
*/
|
|
85
|
+
readonly location?: TranslationIssueLocation;
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* The location of a translation issue within the translated text.
|
|
89
|
+
*/
|
|
90
|
+
interface TranslationIssueLocation {
|
|
91
|
+
/**
|
|
92
|
+
* The starting character index (0-based, inclusive).
|
|
93
|
+
*/
|
|
94
|
+
readonly start: number;
|
|
95
|
+
/**
|
|
96
|
+
* The ending character index (0-based, exclusive).
|
|
97
|
+
*/
|
|
98
|
+
readonly end: number;
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Evaluates the quality of a translation using an LLM.
|
|
102
|
+
*
|
|
103
|
+
* @param model The language model to use for evaluation.
|
|
104
|
+
* @param original The original text that was translated.
|
|
105
|
+
* @param translated The translated text to evaluate.
|
|
106
|
+
* @param options Evaluation options including target language.
|
|
107
|
+
* @returns A promise that resolves to the evaluation result.
|
|
108
|
+
*/
|
|
109
|
+
declare function evaluate(model: LanguageModel, original: string, translated: string, options: EvaluateOptions): Promise<EvaluationResult>;
|
|
110
|
+
//#endregion
|
|
111
|
+
export { EvaluateOptions, EvaluationResult, EvaluatorOptions, TranslationEvaluator, TranslationIssue, TranslationIssueLocation, TranslationIssueType, evaluate };
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { getLogger } from "@logtape/logtape";
|
|
2
|
+
import { generateObject } from "ai";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
|
|
5
|
+
//#region src/evaluation.ts
|
|
6
|
+
const logger = getLogger([
|
|
7
|
+
"vertana",
|
|
8
|
+
"core",
|
|
9
|
+
"evaluate"
|
|
10
|
+
]);
|
|
11
|
+
/** Categories of problems the evaluator model is allowed to report. */
const issueTypeSchema = z.enum(["accuracy", "fluency", "terminology", "style"]);

/** Optional character span of an issue inside the translated text. */
const issueLocationSchema = z.object({
  start: z.number(),
  end: z.number()
});

/** One reported issue: its category, a description, and an optional span. */
const issueSchema = z.object({
  type: issueTypeSchema,
  description: z.string(),
  location: issueLocationSchema.optional()
});

/**
 * Shape of the structured output requested from the model: a score
 * constrained to the range 0..1 plus the list of issues.
 */
const evaluationResultSchema = z.object({
  score: z.number().min(0).max(1),
  issues: z.array(issueSchema)
});
|
|
29
|
+
/**
 * Resolves a locale to its English display name.
 *
 * @param locale A language tag string, or an object exposing `baseName`
 *               (such as an `Intl.Locale`).
 * @returns The English display name of the language, or the raw tag when
 *          no display name can be produced.
 */
function getLanguageName(locale) {
  let tag;
  if (typeof locale === "string") tag = locale;
  else tag = locale.baseName;

  let displayName;
  try {
    const englishNames = new Intl.DisplayNames(["en"], { type: "language" });
    displayName = englishNames.of(tag);
  } catch {
    // Lookup failed (e.g. the tag is not well-formed); fall back to the tag.
    return tag;
  }
  return displayName ?? tag;
}
|
|
40
|
+
/**
 * Builds the system prompt for evaluation.
 *
 * The prompt names the source and target languages, lists the four
 * evaluation criteria, and describes the expected score scale. When a
 * non-empty glossary is supplied, a "Glossary" section is appended with one
 * line per entry.
 *
 * @param options Evaluation options; reads `targetLanguage`,
 *                `sourceLanguage`, and `glossary`.
 * @returns The complete system prompt text.
 */
function buildEvaluationSystemPrompt(options) {
  const targetLang = getLanguageName(options.targetLanguage);
  const sourceLang = (options.sourceLanguage ? getLanguageName(options.sourceLanguage) : null) ?? "the source language";
  let prompt = `You are an expert translation quality evaluator.

Your task is to evaluate the quality of a translation from ${sourceLang} to ${targetLang}.

Evaluate the translation based on these criteria:

1. **Accuracy**: Does the translation accurately convey the meaning of the original text?
2. **Fluency**: Is the translation natural and readable in ${targetLang}?
3. **Terminology**: Are domain-specific terms translated correctly and consistently?
4. **Style**: Does the translation maintain the appropriate tone and style?

Provide:
- A score from 0 to 1 (where 1 is perfect, 0.9+ is excellent, 0.7-0.9 is good, 0.5-0.7 is acceptable, below 0.5 is poor)
- A list of specific issues found, if any

Be strict but fair in your evaluation. Minor issues should result in small deductions, while major meaning errors should significantly lower the score.`;
  if (options.glossary != null && options.glossary.length > 0) {
    prompt += `\n\n## Glossary

The following terms MUST be translated as specified. Violations should be marked as "terminology" issues:

`;
    for (const entry of options.glossary) {
      prompt += `- "${entry.original}" → "${entry.translated}"`;
      // A glossary entry may carry a context that says when its translation
      // applies. Without it, two entries for the same original term would
      // read as contradictory requirements to the evaluator.
      if (entry.context != null && entry.context.trim().length > 0) {
        prompt += ` (context: ${entry.context})`;
      }
      prompt += "\n";
    }
  }
  return prompt;
}
|
|
71
|
+
/**
 * Builds the user prompt for evaluation.
 *
 * @param original The original text that was translated.
 * @param translated The translated text to evaluate.
 * @returns A prompt with the original under "## Original Text", the
 *          translation under "## Translated Text", and a closing request,
 *          each part separated by a blank line.
 */
function buildEvaluationUserPrompt(original, translated) {
  const originalSection = `## Original Text\n\n${original}`;
  const translatedSection = `## Translated Text\n\n${translated}`;
  const request = "Please evaluate this translation.";
  return [originalSection, translatedSection, request].join("\n\n");
}
|
|
85
|
+
/**
 * Evaluates the quality of a translation using an LLM.
 *
 * Sends the system and user prompts to `generateObject` with
 * `evaluationResultSchema` and returns the resulting score and issues.
 *
 * @param model The language model to use for evaluation.
 * @param original The original text that was translated.
 * @param translated The translated text to evaluate.
 * @param options Evaluation options including target language; its `signal`
 *                is passed to the model call as `abortSignal`.
 * @returns A promise that resolves to the evaluation result.
 */
async function evaluate(model, original, translated, options) {
  logger.debug("Evaluating translation quality...");

  const system = buildEvaluationSystemPrompt(options);
  const prompt = buildEvaluationUserPrompt(original, translated);
  const { object } = await generateObject({
    model,
    schema: evaluationResultSchema,
    system,
    prompt,
    abortSignal: options.signal
  });

  // Copy each issue into a plain object with exactly the public fields.
  const issues = object.issues.map(({ type, description, location }) => ({
    type,
    description,
    location
  }));

  logger.debug("Evaluation result: score {score}, {issueCount} issue(s).", {
    score: object.score,
    issueCount: issues.length
  });

  return { score: object.score, issues };
}
|
|
117
|
+
|
|
118
|
+
//#endregion
|
|
119
|
+
export { evaluate };
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
//#region src/glossary.d.ts
|
|
2
|
+
/**
|
|
3
|
+
* A glossary of terms for consistent translation.
|
|
4
|
+
*/
|
|
5
|
+
type Glossary = readonly GlossaryEntry[];
|
|
6
|
+
/**
|
|
7
|
+
* An entry in a {@link Glossary}.
|
|
8
|
+
*/
|
|
9
|
+
interface GlossaryEntry {
|
|
10
|
+
/**
|
|
11
|
+
* The original term in the source language.
|
|
12
|
+
*/
|
|
13
|
+
readonly original: string;
|
|
14
|
+
/**
|
|
15
|
+
* The translated term in the target language.
|
|
16
|
+
*/
|
|
17
|
+
readonly translated: string;
|
|
18
|
+
/**
|
|
19
|
+
* Optional context describing when to use this translation.
|
|
20
|
+
* This helps disambiguate terms that may have multiple translations.
|
|
21
|
+
*/
|
|
22
|
+
readonly context?: string;
|
|
23
|
+
}
|
|
24
|
+
//#endregion
|
|
25
|
+
export { Glossary, GlossaryEntry };
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
//#region src/glossary.d.ts
|
|
2
|
+
/**
|
|
3
|
+
* A glossary of terms for consistent translation.
|
|
4
|
+
*/
|
|
5
|
+
type Glossary = readonly GlossaryEntry[];
|
|
6
|
+
/**
|
|
7
|
+
* An entry in a {@link Glossary}.
|
|
8
|
+
*/
|
|
9
|
+
interface GlossaryEntry {
|
|
10
|
+
/**
|
|
11
|
+
* The original term in the source language.
|
|
12
|
+
*/
|
|
13
|
+
readonly original: string;
|
|
14
|
+
/**
|
|
15
|
+
* The translated term in the target language.
|
|
16
|
+
*/
|
|
17
|
+
readonly translated: string;
|
|
18
|
+
/**
|
|
19
|
+
* Optional context describing when to use this translation.
|
|
20
|
+
* This helps disambiguate terms that may have multiple translations.
|
|
21
|
+
*/
|
|
22
|
+
readonly context?: string;
|
|
23
|
+
}
|
|
24
|
+
//#endregion
|
|
25
|
+
export { Glossary, GlossaryEntry };
|
package/dist/glossary.js
ADDED
|
File without changes
|
package/dist/html.cjs
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
const require_rolldown_runtime = require('./_virtual/rolldown_runtime.cjs');
|
|
2
|
+
const require_tokens = require('./tokens.cjs');
|
|
3
|
+
let htmlparser2 = require("htmlparser2");
|
|
4
|
+
let dom_serializer = require("dom-serializer");
|
|
5
|
+
dom_serializer = require_rolldown_runtime.__toESM(dom_serializer);
|
|
6
|
+
|
|
7
|
+
//#region src/html.ts
|
|
8
|
+
/**
 * Default attributes that should be translated.
 *
 * This array is exported and also read by the attribute checks in this
 * module, so it is frozen: a consumer cannot mutate it and thereby change
 * which attributes the chunker treats as translatable.
 */
const DEFAULT_TRANSLATABLE_ATTRIBUTES = Object.freeze([
  "alt",
  "title",
  "placeholder",
  "aria-label",
  "aria-description"
]);
|
|
18
|
+
/**
 * Lower-case tag names of block-level elements. Each of these creates a
 * natural chunk boundary.
 */
const BLOCK_ELEMENTS = new Set([
  "address", "article", "aside", "blockquote", "canvas",
  "dd", "div", "dl", "dt",
  "fieldset", "figcaption", "figure", "footer", "form",
  "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
  "li", "main", "nav", "noscript",
  "ol", "p", "pre", "section",
  "table", "tbody", "td", "tfoot", "th", "thead", "tr",
  "ul", "video"
]);
|
|
62
|
+
/**
 * Lower-case tag names of elements whose content should not be translated.
 */
const NON_TRANSLATABLE_ELEMENTS = new Set(["script", "style", "svg", "math"]);
|
|
71
|
+
/**
 * Determines if a node is an element.
 *
 * The DOM produced by htmlparser2 gives `<script>` and `<style>` elements
 * their own node types ("script" and "style") instead of "tag". Those types
 * are accepted here as well; otherwise such elements are never recognised
 * as elements and name-based checks on them cannot match.
 *
 * @param node A parsed DOM node.
 * @returns `true` when the node's `type` is "tag", "script", or "style".
 */
function isElement(node) {
  return node.type === "tag" || node.type === "script" || node.type === "style";
}
|
|
77
|
+
/**
 * Determines if a node is a block-level element.
 *
 * @param node A parsed DOM node.
 * @returns `true` when the node is an element whose lower-cased tag name is
 *          listed in `BLOCK_ELEMENTS`.
 */
function isBlockElement(node) {
  if (!isElement(node)) return false;
  const tagName = node.name.toLowerCase();
  return BLOCK_ELEMENTS.has(tagName);
}
|
|
83
|
+
/**
 * Determines if a node should be excluded from translation.
 *
 * @param node A parsed DOM node.
 * @returns `true` when the node is an element whose lower-cased tag name is
 *          listed in `NON_TRANSLATABLE_ELEMENTS`.
 */
function isNonTranslatable(node) {
  if (!isElement(node)) return false;
  const tagName = node.name.toLowerCase();
  return NON_TRANSLATABLE_ELEMENTS.has(tagName);
}
|
|
89
|
+
/**
 * Determines the chunk type from an HTML element.
 *
 * @param element An element node; only its `name` is read
 *                (case-insensitively).
 * @returns "heading" for h1-h6, "list" for ul/ol/dl, "code" for pre/code,
 *          "section" for sectioning elements, otherwise "paragraph".
 */
function getChunkTypeFromElement(element) {
  switch (element.name.toLowerCase()) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
      return "heading";
    case "ul":
    case "ol":
    case "dl":
      return "list";
    case "pre":
    case "code":
      return "code";
    case "section":
    case "article":
    case "header":
    case "footer":
    case "nav":
    case "aside":
    case "main":
      return "section";
    default:
      return "paragraph";
  }
}
|
|
112
|
+
/**
 * Gets the text content of a node (for checking if it has translatable
 * content).
 *
 * Text inside script, style, svg, and math elements is skipped. Their
 * content is not translatable, so it must not make an enclosing block look
 * like it contains text, nor leak into text derived from the block.
 *
 * @param node A parsed DOM node (document, element, text, comment, ...).
 * @returns The concatenated data of all text nodes in the subtree, excluding
 *          non-translatable subtrees; an empty string when there is none.
 */
function getTextContent(node) {
  if ("type" in node && node.type === "text") return node.data;
  if ("name" in node && typeof node.name === "string") {
    // Checked by tag name rather than node type, because script/style
    // elements carry node types other than "tag".
    const tagName = node.name.toLowerCase();
    if (tagName === "script" || tagName === "style" || tagName === "svg" || tagName === "math") return "";
  }
  if ("children" in node) return node.children.map((child) => getTextContent(child)).join("");
  return "";
}
|
|
121
|
+
/**
 * Checks if an element, or any element nested inside it, has a translatable
 * attribute with a non-blank value.
 *
 * @param node A parsed DOM node; non-element nodes always yield `false`.
 * @returns `true` when any attribute named in
 *          `DEFAULT_TRANSLATABLE_ATTRIBUTES` has a non-blank value on the
 *          node or on one of its descendants.
 */
function hasTranslatableAttributes(node) {
  if (!isElement(node)) return false;

  const ownAttributeMatches = DEFAULT_TRANSLATABLE_ATTRIBUTES.some((attr) => {
    const value = node.attribs[attr];
    return value != null && value.trim().length > 0;
  });
  if (ownAttributeMatches) return true;

  // Otherwise look for a match anywhere below this element.
  return node.children.some((child) => hasTranslatableAttributes(child));
}
|
|
133
|
+
/**
 * Checks if a node has any translatable content.
 *
 * @param node A parsed DOM node.
 * @returns `false` for non-translatable elements; otherwise `true` when the
 *          node has non-blank text content or a translatable attribute.
 */
function hasTranslatableContent(node) {
  if (isNonTranslatable(node)) return false;
  const visibleText = getTextContent(node).trim();
  return visibleText.length > 0 || hasTranslatableAttributes(node);
}
|
|
141
|
+
/**
 * Extracts translatable blocks from an HTML document.
 *
 * Walks the tree from the document's children. Non-translatable elements are
 * skipped entirely; the first block-level element met on each path becomes
 * one block (serialized with its whole subtree) if it has translatable
 * content; other elements are descended into.
 *
 * Both HTML chunker options are honoured here:
 * - `additionalTranslatableAttributes`: a block whose only translatable
 *   content is a non-blank value in one of these attributes is kept.
 * - `stripComments`: comment nodes are removed from a block before it is
 *   serialized. This edits the passed-in tree in place.
 *
 * NOTE(review): text and inline elements that are not inside any block-level
 * element produce no block at all — confirm that this is intended.
 *
 * @param doc The parsed document.
 * @param options HTML chunker options; may be omitted.
 * @returns The extracted blocks as `{ html, type }` objects in document
 *          order.
 */
function extractBlocks(doc, options) {
  // Compared in lower case; the chunker parses with lower-cased attribute
  // names.
  const extraAttributes = (options?.additionalTranslatableAttributes ?? []).map((attr) => attr.toLowerCase());
  const stripComments = options?.stripComments === true;
  const blocks = [];

  /** True when `node` or a descendant has a non-blank caller-supplied attribute. */
  function hasExtraTranslatableAttribute(node) {
    if (extraAttributes.length === 0 || !isElement(node)) return false;
    for (const attr of extraAttributes) {
      const value = node.attribs[attr];
      if (typeof value === "string" && value.trim().length > 0) return true;
    }
    return node.children.some((child) => hasExtraTranslatableAttribute(child));
  }

  /**
   * Drops comment nodes from the subtree in place. Only the `children`
   * arrays are updated, which is all the serializer below needs.
   */
  function removeComments(node) {
    if (!("children" in node)) return;
    node.children = node.children.filter((child) => child.type !== "comment");
    for (const child of node.children) removeComments(child);
  }

  function processNode(node) {
    if (!isElement(node)) return;
    const name = node.name.toLowerCase();
    if (NON_TRANSLATABLE_ELEMENTS.has(name)) return;
    if (BLOCK_ELEMENTS.has(name)) {
      if (hasTranslatableContent(node) || hasExtraTranslatableAttribute(node)) {
        if (stripComments) removeComments(node);
        blocks.push({
          html: (0, dom_serializer.default)(node),
          type: getChunkTypeFromElement(node)
        });
      }
      // Nested blocks travel with their outermost block; do not descend.
      return;
    }
    for (const child of node.children) processNode(child);
  }

  for (const node of doc.children) processNode(node);
  return blocks;
}
|
|
162
|
+
/**
 * Splits text content at sentence boundaries.
 *
 * A boundary is a run of whitespace that directly follows `.`, `!`, or `?`.
 * The whitespace itself is dropped and blank pieces are discarded.
 *
 * @param text The text to split.
 * @returns The non-blank sentences in their original order.
 */
function splitAtSentences(text) {
  const sentences = [];
  for (const piece of text.split(/(?<=[.!?])\s+/)) {
    if (piece.trim().length > 0) sentences.push(piece);
  }
  return sentences;
}
|
|
168
|
+
/**
 * Splits an HTML block into smaller pieces if it exceeds the token limit.
 *
 * Strategy, in order:
 * 1. A block within the limit is returned unchanged.
 * 2. If the block is a single element with more than one block-level child,
 *    each of those children is serialized and split recursively.
 * 3. Otherwise the block's text is split into sentences, which are greedily
 *    packed into pieces that stay within the limit. These pieces contain
 *    text only; the parser hands back entity-decoded text, so `&`, `<`, and
 *    `>` are re-escaped to keep each piece valid HTML.
 * 4. If no split is possible, the oversized block is returned as is.
 *
 * NOTE(review): step 2 keeps only the block-level children; the wrapper
 * element and any text or inline siblings are not emitted. Step 3 drops
 * inline markup. Confirm both are acceptable for callers.
 *
 * @param block The `{ html, type }` block to split.
 * @param maxTokens Maximum number of tokens allowed per piece.
 * @param countTokens$1 Function returning the token count of a string.
 * @returns One or more blocks; a single sentence over the limit is still
 *          emitted as its own piece.
 */
function splitHtmlBlock(block, maxTokens, countTokens$1) {
  if (countTokens$1(block.html) <= maxTokens) return [block];
  const doc = (0, htmlparser2.parseDocument)(block.html);
  const children = doc.children;
  if (children.length === 1 && isElement(children[0])) {
    const blockChildren = children[0].children.filter(isBlockElement);
    if (blockChildren.length > 1) {
      const result = [];
      for (const child of blockChildren) {
        const childBlocks = splitHtmlBlock({
          html: (0, dom_serializer.default)(child),
          type: getChunkTypeFromElement(child)
        }, maxTokens, countTokens$1);
        result.push(...childBlocks);
      }
      return result;
    }
  }
  /** Escapes the characters that are significant in HTML text content. */
  const escapeHtmlText = (text) => text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  // Escape before packing so token counts reflect what is actually emitted.
  const sentences = splitAtSentences(getTextContent(doc)).map(escapeHtmlText);
  if (sentences.length > 1) {
    const result = [];
    let currentText = "";
    for (const sentence of sentences) {
      const combined = currentText ? `${currentText} ${sentence}` : sentence;
      if (countTokens$1(combined) <= maxTokens) currentText = combined;
      else {
        if (currentText) result.push({
          html: currentText,
          type: block.type
        });
        currentText = sentence;
      }
    }
    if (currentText) result.push({
      html: currentText,
      type: block.type
    });
    return result;
  }
  return [block];
}
|
|
212
|
+
/**
 * Creates an HTML chunker.
 *
 * The returned chunker parses HTML content and produces chunks that respect
 * element boundaries. Each block element is kept as a single chunk when
 * possible and is only split when it exceeds the token limit.
 *
 * The chunker reads `maxTokens` (default 4096), `countTokens` (default: the
 * token counter from the tokens module), and `signal` from its options. The
 * signal is checked before parsing and before each block is split. Blank
 * input yields an empty array.
 *
 * @param htmlOptions Optional HTML-specific chunking options; forwarded to
 *                    `extractBlocks`.
 * @returns A chunker function for HTML content.
 * @since 0.2.0
 */
function createHtmlChunker(htmlOptions) {
  const options = htmlOptions ?? {};

  return async (text, chunkerOptions) => {
    const maxTokens = chunkerOptions?.maxTokens ?? 4096;
    const countTokens$1 = chunkerOptions?.countTokens ?? require_tokens.countTokens;
    const signal = chunkerOptions?.signal;

    signal?.throwIfAborted();
    await Promise.resolve();
    if (text.trim().length === 0) return [];

    const doc = (0, htmlparser2.parseDocument)(text, {
      lowerCaseTags: true,
      lowerCaseAttributeNames: true
    });

    const chunks = [];
    for (const block of extractBlocks(doc, options)) {
      signal?.throwIfAborted();
      for (const piece of splitHtmlBlock(block, maxTokens, countTokens$1)) {
        // The index is the chunk's position in the output list.
        chunks.push({
          content: piece.html,
          type: piece.type,
          index: chunks.length
        });
      }
    }
    return chunks;
  };
}
|
|
250
|
+
|
|
251
|
+
//#endregion
|
|
252
|
+
exports.DEFAULT_TRANSLATABLE_ATTRIBUTES = DEFAULT_TRANSLATABLE_ATTRIBUTES;
|
|
253
|
+
exports.createHtmlChunker = createHtmlChunker;
|
package/dist/html.d.cts
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { Chunker } from "./chunking.cjs";
|
|
2
|
+
|
|
3
|
+
//#region src/html.d.ts
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Options specific to HTML chunking.
|
|
7
|
+
*
|
|
8
|
+
* @since 0.2.0
|
|
9
|
+
*/
|
|
10
|
+
interface HtmlChunkerOptions {
|
|
11
|
+
/**
|
|
12
|
+
* Additional HTML attributes to include for translation.
|
|
13
|
+
* Default translatable attributes: alt, title, placeholder, aria-label,
|
|
14
|
+
* aria-description.
|
|
15
|
+
*/
|
|
16
|
+
readonly additionalTranslatableAttributes?: readonly string[];
|
|
17
|
+
/**
|
|
18
|
+
* Whether to strip HTML comments from the output.
|
|
19
|
+
*
|
|
20
|
+
* @default false
|
|
21
|
+
*/
|
|
22
|
+
readonly stripComments?: boolean;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Default attributes that should be translated.
|
|
26
|
+
*/
|
|
27
|
+
declare const DEFAULT_TRANSLATABLE_ATTRIBUTES: readonly ["alt", "title", "placeholder", "aria-label", "aria-description"];
|
|
28
|
+
/**
|
|
29
|
+
* Creates an HTML chunker.
|
|
30
|
+
*
|
|
31
|
+
* The chunker parses HTML content and creates chunks that respect element
|
|
32
|
+
* boundaries. Each block element is kept as a single chunk when possible,
|
|
33
|
+
* and only split when exceeding the token limit.
|
|
34
|
+
*
|
|
35
|
+
* @param htmlOptions Optional HTML-specific chunking options.
|
|
36
|
+
* @returns A chunker function for HTML content.
|
|
37
|
+
* @since 0.2.0
|
|
38
|
+
*/
|
|
39
|
+
declare function createHtmlChunker(htmlOptions?: HtmlChunkerOptions): Chunker;
|
|
40
|
+
//#endregion
|
|
41
|
+
export { DEFAULT_TRANSLATABLE_ATTRIBUTES, HtmlChunkerOptions, createHtmlChunker };
|
package/dist/html.d.ts
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { Chunker } from "./chunking.js";
|
|
2
|
+
|
|
3
|
+
//#region src/html.d.ts
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Options specific to HTML chunking.
|
|
7
|
+
*
|
|
8
|
+
* @since 0.2.0
|
|
9
|
+
*/
|
|
10
|
+
interface HtmlChunkerOptions {
|
|
11
|
+
/**
|
|
12
|
+
* Additional HTML attributes to include for translation.
|
|
13
|
+
* Default translatable attributes: alt, title, placeholder, aria-label,
|
|
14
|
+
* aria-description.
|
|
15
|
+
*/
|
|
16
|
+
readonly additionalTranslatableAttributes?: readonly string[];
|
|
17
|
+
/**
|
|
18
|
+
* Whether to strip HTML comments from the output.
|
|
19
|
+
*
|
|
20
|
+
* @default false
|
|
21
|
+
*/
|
|
22
|
+
readonly stripComments?: boolean;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Default attributes that should be translated.
|
|
26
|
+
*/
|
|
27
|
+
declare const DEFAULT_TRANSLATABLE_ATTRIBUTES: readonly ["alt", "title", "placeholder", "aria-label", "aria-description"];
|
|
28
|
+
/**
|
|
29
|
+
* Creates an HTML chunker.
|
|
30
|
+
*
|
|
31
|
+
* The chunker parses HTML content and creates chunks that respect element
|
|
32
|
+
* boundaries. Each block element is kept as a single chunk when possible,
|
|
33
|
+
* and only split when exceeding the token limit.
|
|
34
|
+
*
|
|
35
|
+
* @param htmlOptions Optional HTML-specific chunking options.
|
|
36
|
+
* @returns A chunker function for HTML content.
|
|
37
|
+
* @since 0.2.0
|
|
38
|
+
*/
|
|
39
|
+
declare function createHtmlChunker(htmlOptions?: HtmlChunkerOptions): Chunker;
|
|
40
|
+
//#endregion
|
|
41
|
+
export { DEFAULT_TRANSLATABLE_ATTRIBUTES, HtmlChunkerOptions, createHtmlChunker };
|