@vertana/core 0.1.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/dist/_virtual/rolldown_runtime.cjs +29 -0
- package/dist/accumulator.cjs +64 -0
- package/dist/accumulator.d.cts +51 -0
- package/dist/accumulator.d.ts +51 -0
- package/dist/accumulator.js +61 -0
- package/dist/chunking.cjs +76 -0
- package/dist/chunking.d.cts +124 -0
- package/dist/chunking.d.ts +124 -0
- package/dist/chunking.js +74 -0
- package/dist/context.cjs +51 -0
- package/dist/context.d.cts +148 -0
- package/dist/context.d.ts +148 -0
- package/dist/context.js +49 -0
- package/dist/evaluation.cjs +120 -0
- package/dist/evaluation.d.cts +111 -0
- package/dist/evaluation.d.ts +111 -0
- package/dist/evaluation.js +119 -0
- package/dist/glossary.cjs +0 -0
- package/dist/glossary.d.cts +25 -0
- package/dist/glossary.d.ts +25 -0
- package/dist/glossary.js +0 -0
- package/dist/html.cjs +253 -0
- package/dist/html.d.cts +41 -0
- package/dist/html.d.ts +41 -0
- package/dist/html.js +250 -0
- package/dist/index.cjs +39 -0
- package/dist/index.d.cts +17 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +16 -0
- package/dist/markdown.cjs +300 -0
- package/dist/markdown.d.cts +17 -0
- package/dist/markdown.d.ts +17 -0
- package/dist/markdown.js +300 -0
- package/dist/plaintext.cjs +70 -0
- package/dist/plaintext.d.cts +17 -0
- package/dist/plaintext.d.ts +17 -0
- package/dist/plaintext.js +70 -0
- package/dist/prompt.cjs +91 -0
- package/dist/prompt.d.cts +74 -0
- package/dist/prompt.d.ts +74 -0
- package/dist/prompt.js +86 -0
- package/dist/refine.cjs +243 -0
- package/dist/refine.d.cts +148 -0
- package/dist/refine.d.ts +148 -0
- package/dist/refine.js +241 -0
- package/dist/select.cjs +62 -0
- package/dist/select.d.cts +83 -0
- package/dist/select.d.ts +83 -0
- package/dist/select.js +61 -0
- package/dist/terms.cjs +60 -0
- package/dist/terms.d.cts +36 -0
- package/dist/terms.d.ts +36 -0
- package/dist/terms.js +59 -0
- package/dist/tokens.cjs +40 -0
- package/dist/tokens.d.cts +24 -0
- package/dist/tokens.d.ts +24 -0
- package/dist/tokens.js +38 -0
- package/dist/tools.cjs +35 -0
- package/dist/tools.d.cts +20 -0
- package/dist/tools.d.ts +20 -0
- package/dist/tools.js +34 -0
- package/dist/translate.cjs +200 -0
- package/dist/translate.d.cts +190 -0
- package/dist/translate.d.ts +190 -0
- package/dist/translate.js +199 -0
- package/dist/window.cjs +0 -0
- package/dist/window.d.cts +48 -0
- package/dist/window.d.ts +48 -0
- package/dist/window.js +0 -0
- package/package.json +215 -0
package/dist/html.js
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import { countTokens } from "./tokens.js";
|
|
2
|
+
import { parseDocument } from "htmlparser2";
|
|
3
|
+
import render from "dom-serializer";
|
|
4
|
+
|
|
5
|
+
//#region src/html.ts
|
|
6
|
+
/**
 * Default attributes that should be translated.
 *
 * These are HTML attributes whose values carry human-readable text
 * (alt text, tooltips, form placeholders, ARIA labels). Exported as part
 * of the public API (see the module's export list).
 */
const DEFAULT_TRANSLATABLE_ATTRIBUTES = [
	"alt",
	"title",
	"placeholder",
	"aria-label",
	"aria-description"
];
|
|
16
|
+
/**
 * Block-level elements that create natural chunk boundaries.
 *
 * When `extractBlocks` walks the DOM, hitting any of these tag names stops
 * the descent and the whole element is emitted as one translation block.
 * Tag names are stored lowercase; lookups lowercase the node name first.
 */
const BLOCK_ELEMENTS = new Set([
	"address",
	"article",
	"aside",
	"blockquote",
	"canvas",
	"dd",
	"div",
	"dl",
	"dt",
	"fieldset",
	"figcaption",
	"figure",
	"footer",
	"form",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"header",
	"hr",
	"li",
	"main",
	"nav",
	"noscript",
	"ol",
	"p",
	"pre",
	"section",
	"table",
	"tbody",
	"td",
	"tfoot",
	"th",
	"thead",
	"tr",
	"ul",
	"video"
]);
|
|
60
|
+
/**
 * Elements whose content should not be translated.
 *
 * Subtrees rooted at these tags are skipped entirely during block
 * extraction and excluded by `hasTranslatableContent`.
 */
const NON_TRANSLATABLE_ELEMENTS = new Set([
	"script",
	"style",
	"svg",
	"math"
]);
|
|
69
|
+
/**
 * Determines if a node is an element.
 *
 * NOTE(review): this only matches nodes whose `type` is exactly "tag";
 * htmlparser2/domhandler may tag script/style elements with other type
 * strings — confirm against the parser's node model if that matters here.
 */
function isElement(node) {
	const { type } = node;
	return type === "tag";
}
|
|
75
|
+
/**
 * Determines if an element is a block-level element.
 *
 * Non-element nodes are never block-level; element names are compared
 * case-insensitively against `BLOCK_ELEMENTS`.
 */
function isBlockElement(node) {
	if (!isElement(node)) return false;
	return BLOCK_ELEMENTS.has(node.name.toLowerCase());
}
|
|
81
|
+
/**
 * Determines if an element should be excluded from translation.
 *
 * True only for element nodes whose (lowercased) tag name appears in
 * `NON_TRANSLATABLE_ELEMENTS`.
 */
function isNonTranslatable(node) {
	if (!isElement(node)) return false;
	return NON_TRANSLATABLE_ELEMENTS.has(node.name.toLowerCase());
}
|
|
87
|
+
/**
 * Determines the chunk type from an HTML element.
 *
 * Maps the element's tag name to one of: "heading" (h1–h6), "list"
 * (ul/ol/dl), "code" (pre/code), "section" (sectioning elements), or the
 * default "paragraph".
 */
function getChunkTypeFromElement(element) {
	const tag = element.name.toLowerCase();
	if (/^h[1-6]$/.test(tag)) return "heading";
	switch (tag) {
		case "ul":
		case "ol":
		case "dl":
			return "list";
		case "pre":
		case "code":
			return "code";
		case "section":
		case "article":
		case "header":
		case "footer":
		case "nav":
		case "aside":
		case "main":
			return "section";
		default:
			return "paragraph";
	}
}
|
|
110
|
+
/**
 * Gets the text content of a node (for checking if it has translatable
 * content).
 *
 * Recursively concatenates the `data` of every text node in the subtree.
 * Nodes that are neither text nor containers contribute the empty string.
 */
function getTextContent(node) {
	if ("type" in node && node.type === "text") return node.data;
	if (!("children" in node)) return "";
	let text = "";
	for (const child of node.children) text += getTextContent(child);
	return text;
}
|
|
119
|
+
/**
 * Checks if an element has any translatable attributes.
 *
 * Returns true when the element itself, or any descendant element, carries
 * one of `DEFAULT_TRANSLATABLE_ATTRIBUTES` with a non-blank value.
 * Non-element nodes always yield false.
 */
function hasTranslatableAttributes(node) {
	if (!isElement(node)) return false;
	const ownMatch = DEFAULT_TRANSLATABLE_ATTRIBUTES.some((attr) => {
		const value = node.attribs[attr];
		return value != null && value.trim().length > 0;
	});
	if (ownMatch) return true;
	return node.children.some((child) => hasTranslatableAttributes(child));
}
|
|
131
|
+
/**
 * Checks if a node has any translatable content.
 *
 * Non-translatable elements (script/style/svg/math) are rejected outright;
 * otherwise the node qualifies when its subtree contains non-blank text or
 * any translatable attribute.
 */
function hasTranslatableContent(node) {
	if (isNonTranslatable(node)) return false;
	const text = getTextContent(node);
	return text.trim().length > 0 || hasTranslatableAttributes(node);
}
|
|
139
|
+
/**
 * Extracts translatable blocks from an HTML document.
 *
 * Walks the element tree top-down. Subtrees under non-translatable tags are
 * skipped; the first block-level element encountered on each path is
 * serialized as one block (descent stops there); other elements are
 * descended into. The `_options` parameter is currently unused.
 */
function extractBlocks(doc, _options) {
	const blocks = [];
	const visit = (node) => {
		if (!isElement(node)) return;
		const tag = node.name.toLowerCase();
		if (NON_TRANSLATABLE_ELEMENTS.has(tag)) return;
		if (!BLOCK_ELEMENTS.has(tag)) {
			// Not a chunk boundary: keep descending.
			for (const child of node.children) visit(child);
			return;
		}
		// Block boundary: emit it (if worth translating) and stop descending.
		if (hasTranslatableContent(node)) {
			blocks.push({
				html: render(node),
				type: getChunkTypeFromElement(node)
			});
		}
	};
	for (const node of doc.children) visit(node);
	return blocks;
}
|
|
160
|
+
/**
 * Splits text content at sentence boundaries.
 *
 * Splits after `.`, `!`, or `?` followed by whitespace, discarding
 * whitespace-only fragments.
 */
function splitAtSentences(text) {
	const pieces = text.split(/(?<=[.!?])\s+/);
	return pieces.filter((piece) => piece.trim().length > 0);
}
|
|
166
|
+
/**
 * Splits an HTML block into smaller pieces if it exceeds the token limit.
 *
 * Strategy: if the block is a single root element with more than one
 * block-level child, split recursively along those children. Otherwise
 * fall back to sentence-level packing of the block's plain text.
 *
 * NOTE(review): the sentence fallback stores `getTextContent` output in the
 * `html` field — inline markup inside the block is dropped on that path.
 * Also, a single sentence longer than `maxTokens` is emitted oversized.
 * Confirm both are intended.
 */
function splitHtmlBlock(block, maxTokens, countTokens$1) {
	// Already fits: nothing to do.
	if (countTokens$1(block.html) <= maxTokens) return [block];
	const doc = parseDocument(block.html);
	const children = doc.children;
	// Case 1: single root element — try splitting along its block children.
	if (children.length === 1 && isElement(children[0])) {
		const blockChildren = children[0].children.filter(isBlockElement);
		if (blockChildren.length > 1) {
			const result = [];
			for (const child of blockChildren) {
				// Each child may itself be oversized; recurse.
				const childBlocks = splitHtmlBlock({
					html: render(child),
					type: getChunkTypeFromElement(child)
				}, maxTokens, countTokens$1);
				result.push(...childBlocks);
			}
			return result;
		}
	}
	// Case 2: greedy sentence packing over the plain-text content.
	const sentences = splitAtSentences(getTextContent(doc));
	if (sentences.length > 1) {
		const result = [];
		let currentText = "";
		for (const sentence of sentences) {
			const combined = currentText ? `${currentText} ${sentence}` : sentence;
			if (countTokens$1(combined) <= maxTokens) currentText = combined;
			else {
				// Flush the accumulated run and start a new one.
				if (currentText) result.push({
					html: currentText,
					type: block.type
				});
				currentText = sentence;
			}
		}
		if (currentText) result.push({
			html: currentText,
			type: block.type
		});
		return result;
	}
	// Unsplittable: return the oversized block unchanged.
	return [block];
}
|
|
210
|
+
/**
 * Creates an HTML chunker.
 *
 * The chunker parses HTML content and creates chunks that respect element
 * boundaries. Each block element is kept as a single chunk when possible,
 * and only split when exceeding the token limit.
 *
 * @param htmlOptions Optional HTML-specific chunking options.
 * @returns A chunker function for HTML content.
 * @since 0.2.0
 */
function createHtmlChunker(htmlOptions) {
	const options = htmlOptions ?? {};
	return async (text, chunkerOptions) => {
		const maxTokens = chunkerOptions?.maxTokens ?? 4096;
		// Fall back to the module-level token counter when none is supplied.
		const countTokens$1 = chunkerOptions?.countTokens ?? countTokens;
		const signal = chunkerOptions?.signal;
		signal?.throwIfAborted();
		// Yield to the microtask queue before doing synchronous parse work.
		await Promise.resolve();
		if (text.trim().length === 0) return [];
		const blocks = extractBlocks(parseDocument(text, {
			lowerCaseTags: true,
			lowerCaseAttributeNames: true
		}), options);
		const chunks = [];
		let chunkIndex = 0;
		for (const block of blocks) {
			// Honor cancellation between blocks.
			signal?.throwIfAborted();
			const splitBlocks = splitHtmlBlock(block, maxTokens, countTokens$1);
			for (const splitBlock of splitBlocks) chunks.push({
				content: splitBlock.html,
				type: splitBlock.type,
				index: chunkIndex++
			});
		}
		return chunks;
	};
}
|
|
248
|
+
|
|
249
|
+
//#endregion
|
|
250
|
+
export { DEFAULT_TRANSLATABLE_ATTRIBUTES, createHtmlChunker };
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
// Auto-generated CommonJS entry point (rolldown output): re-exports the
// package's public API from the per-module bundles. No logic lives here.
const require_context = require('./context.cjs');
const require_evaluation = require('./evaluation.cjs');
const require_chunking = require('./chunking.cjs');
const require_tokens = require('./tokens.cjs');
const require_markdown = require('./markdown.cjs');
const require_html = require('./html.cjs');
const require_plaintext = require('./plaintext.cjs');
const require_refine = require('./refine.cjs');
const require_select = require('./select.cjs');
const require_prompt = require('./prompt.cjs');
const require_terms = require('./terms.cjs');
const require_tools = require('./tools.cjs');
const require_translate = require('./translate.cjs');
const require_accumulator = require('./accumulator.cjs');

exports.accumulateEvent = require_accumulator.accumulateEvent;
exports.buildSystemPrompt = require_prompt.buildSystemPrompt;
exports.buildUserPrompt = require_prompt.buildUserPrompt;
exports.buildUserPromptWithContext = require_prompt.buildUserPromptWithContext;
exports.chunkText = require_chunking.chunkText;
exports.combineContextResults = require_context.combineContextResults;
exports.countTokens = require_tokens.countTokens;
exports.createDefaultTokenCounter = require_tokens.createDefaultTokenCounter;
exports.createHtmlChunker = require_html.createHtmlChunker;
exports.createInitialAccumulatorState = require_accumulator.createInitialAccumulatorState;
exports.createMarkdownChunker = require_markdown.createMarkdownChunker;
exports.createPlainTextChunker = require_plaintext.createPlainTextChunker;
exports.createToolSet = require_tools.createToolSet;
exports.evaluate = require_evaluation.evaluate;
exports.evaluateBoundary = require_refine.evaluateBoundary;
exports.extractTerms = require_terms.extractTerms;
exports.extractTitle = require_prompt.extractTitle;
exports.gatherRequiredContext = require_context.gatherRequiredContext;
exports.getDefaultChunker = require_chunking.getDefaultChunker;
exports.getLanguageName = require_prompt.getLanguageName;
exports.maxByValue = require_accumulator.maxByValue;
exports.refineChunks = require_refine.refineChunks;
exports.selectBest = require_select.selectBest;
exports.translateChunks = require_translate.translateChunks;
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { Glossary, GlossaryEntry } from "./glossary.cjs";
|
|
2
|
+
import { MediaType, SystemPromptOptions, TranslatedChunk, TranslationTone, buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, extractTitle, getLanguageName } from "./prompt.cjs";
|
|
3
|
+
import { DynamicGlossaryOptions, RefinementOptions, TranslateChunksComplete, TranslateChunksEvent, TranslateChunksOptions, TranslatedChunkEvent, translateChunks } from "./translate.cjs";
|
|
4
|
+
import { AccumulatorState, accumulateEvent, createInitialAccumulatorState, maxByValue } from "./accumulator.cjs";
|
|
5
|
+
import { Chunk, ChunkTextOptions, ChunkType, Chunker, ChunkerOptions, TokenCounter, chunkText, getDefaultChunker } from "./chunking.cjs";
|
|
6
|
+
import { ContextResult, ContextSource, ContextSourceFactory, ContextSourceGatherOptions, PassiveContextSource, RequiredContextSource, combineContextResults, gatherRequiredContext } from "./context.cjs";
|
|
7
|
+
import { EvaluateOptions, EvaluationResult, EvaluatorOptions, TranslationEvaluator, TranslationIssue, TranslationIssueLocation, TranslationIssueType, evaluate } from "./evaluation.cjs";
|
|
8
|
+
import { HtmlChunkerOptions, createHtmlChunker } from "./html.cjs";
|
|
9
|
+
import { AdaptiveContextWindow, ContextWindow, ExplicitContextWindow } from "./window.cjs";
|
|
10
|
+
import { countTokens, createDefaultTokenCounter } from "./tokens.cjs";
|
|
11
|
+
import { createMarkdownChunker } from "./markdown.cjs";
|
|
12
|
+
import { createPlainTextChunker } from "./plaintext.cjs";
|
|
13
|
+
import { BoundaryEvaluation, BoundaryIssue, RefineChunksOptions, RefineChunksResult, RefineIteration, evaluateBoundary, refineChunks } from "./refine.cjs";
|
|
14
|
+
import { Candidate, RankedCandidate, SelectBestOptions, SelectBestResult, selectBest } from "./select.cjs";
|
|
15
|
+
import { ExtractTermsOptions, extractTerms } from "./terms.cjs";
|
|
16
|
+
import { createToolSet } from "./tools.cjs";
|
|
17
|
+
export { type AccumulatorState, type AdaptiveContextWindow, type BoundaryEvaluation, type BoundaryIssue, type Candidate, type Chunk, type ChunkTextOptions, type ChunkType, type Chunker, type ChunkerOptions, type ContextResult, type ContextSource, type ContextSourceFactory, type ContextSourceGatherOptions, type ContextWindow, type DynamicGlossaryOptions, type EvaluateOptions, type EvaluationResult, type EvaluatorOptions, type ExplicitContextWindow, type ExtractTermsOptions, type Glossary, type GlossaryEntry, type HtmlChunkerOptions, type MediaType, type PassiveContextSource, type RankedCandidate, type RefineChunksOptions, type RefineChunksResult, type RefineIteration, type RefinementOptions, type RequiredContextSource, type SelectBestOptions, type SelectBestResult, type SystemPromptOptions, type TokenCounter, type TranslateChunksComplete, type TranslateChunksEvent, type TranslateChunksOptions, type TranslatedChunk, type TranslatedChunkEvent, type TranslationEvaluator, type TranslationIssue, type TranslationIssueLocation, type TranslationIssueType, type TranslationTone, accumulateEvent, buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, chunkText, combineContextResults, countTokens, createDefaultTokenCounter, createHtmlChunker, createInitialAccumulatorState, createMarkdownChunker, createPlainTextChunker, createToolSet, evaluate, evaluateBoundary, extractTerms, extractTitle, gatherRequiredContext, getDefaultChunker, getLanguageName, maxByValue, refineChunks, selectBest, translateChunks };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { Glossary, GlossaryEntry } from "./glossary.js";
|
|
2
|
+
import { MediaType, SystemPromptOptions, TranslatedChunk, TranslationTone, buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, extractTitle, getLanguageName } from "./prompt.js";
|
|
3
|
+
import { DynamicGlossaryOptions, RefinementOptions, TranslateChunksComplete, TranslateChunksEvent, TranslateChunksOptions, TranslatedChunkEvent, translateChunks } from "./translate.js";
|
|
4
|
+
import { AccumulatorState, accumulateEvent, createInitialAccumulatorState, maxByValue } from "./accumulator.js";
|
|
5
|
+
import { Chunk, ChunkTextOptions, ChunkType, Chunker, ChunkerOptions, TokenCounter, chunkText, getDefaultChunker } from "./chunking.js";
|
|
6
|
+
import { ContextResult, ContextSource, ContextSourceFactory, ContextSourceGatherOptions, PassiveContextSource, RequiredContextSource, combineContextResults, gatherRequiredContext } from "./context.js";
|
|
7
|
+
import { EvaluateOptions, EvaluationResult, EvaluatorOptions, TranslationEvaluator, TranslationIssue, TranslationIssueLocation, TranslationIssueType, evaluate } from "./evaluation.js";
|
|
8
|
+
import { HtmlChunkerOptions, createHtmlChunker } from "./html.js";
|
|
9
|
+
import { AdaptiveContextWindow, ContextWindow, ExplicitContextWindow } from "./window.js";
|
|
10
|
+
import { countTokens, createDefaultTokenCounter } from "./tokens.js";
|
|
11
|
+
import { createMarkdownChunker } from "./markdown.js";
|
|
12
|
+
import { createPlainTextChunker } from "./plaintext.js";
|
|
13
|
+
import { BoundaryEvaluation, BoundaryIssue, RefineChunksOptions, RefineChunksResult, RefineIteration, evaluateBoundary, refineChunks } from "./refine.js";
|
|
14
|
+
import { Candidate, RankedCandidate, SelectBestOptions, SelectBestResult, selectBest } from "./select.js";
|
|
15
|
+
import { ExtractTermsOptions, extractTerms } from "./terms.js";
|
|
16
|
+
import { createToolSet } from "./tools.js";
|
|
17
|
+
export { type AccumulatorState, type AdaptiveContextWindow, type BoundaryEvaluation, type BoundaryIssue, type Candidate, type Chunk, type ChunkTextOptions, type ChunkType, type Chunker, type ChunkerOptions, type ContextResult, type ContextSource, type ContextSourceFactory, type ContextSourceGatherOptions, type ContextWindow, type DynamicGlossaryOptions, type EvaluateOptions, type EvaluationResult, type EvaluatorOptions, type ExplicitContextWindow, type ExtractTermsOptions, type Glossary, type GlossaryEntry, type HtmlChunkerOptions, type MediaType, type PassiveContextSource, type RankedCandidate, type RefineChunksOptions, type RefineChunksResult, type RefineIteration, type RefinementOptions, type RequiredContextSource, type SelectBestOptions, type SelectBestResult, type SystemPromptOptions, type TokenCounter, type TranslateChunksComplete, type TranslateChunksEvent, type TranslateChunksOptions, type TranslatedChunk, type TranslatedChunkEvent, type TranslationEvaluator, type TranslationIssue, type TranslationIssueLocation, type TranslationIssueType, type TranslationTone, accumulateEvent, buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, chunkText, combineContextResults, countTokens, createDefaultTokenCounter, createHtmlChunker, createInitialAccumulatorState, createMarkdownChunker, createPlainTextChunker, createToolSet, evaluate, evaluateBoundary, extractTerms, extractTitle, gatherRequiredContext, getDefaultChunker, getLanguageName, maxByValue, refineChunks, selectBest, translateChunks };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { combineContextResults, gatherRequiredContext } from "./context.js";
|
|
2
|
+
import { evaluate } from "./evaluation.js";
|
|
3
|
+
import { chunkText, getDefaultChunker } from "./chunking.js";
|
|
4
|
+
import { countTokens, createDefaultTokenCounter } from "./tokens.js";
|
|
5
|
+
import { createMarkdownChunker } from "./markdown.js";
|
|
6
|
+
import { createHtmlChunker } from "./html.js";
|
|
7
|
+
import { createPlainTextChunker } from "./plaintext.js";
|
|
8
|
+
import { evaluateBoundary, refineChunks } from "./refine.js";
|
|
9
|
+
import { selectBest } from "./select.js";
|
|
10
|
+
import { buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, extractTitle, getLanguageName } from "./prompt.js";
|
|
11
|
+
import { extractTerms } from "./terms.js";
|
|
12
|
+
import { createToolSet } from "./tools.js";
|
|
13
|
+
import { translateChunks } from "./translate.js";
|
|
14
|
+
import { accumulateEvent, createInitialAccumulatorState, maxByValue } from "./accumulator.js";
|
|
15
|
+
|
|
16
|
+
export { accumulateEvent, buildSystemPrompt, buildUserPrompt, buildUserPromptWithContext, chunkText, combineContextResults, countTokens, createDefaultTokenCounter, createHtmlChunker, createInitialAccumulatorState, createMarkdownChunker, createPlainTextChunker, createToolSet, evaluate, evaluateBoundary, extractTerms, extractTitle, gatherRequiredContext, getDefaultChunker, getLanguageName, maxByValue, refineChunks, selectBest, translateChunks };
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
const require_tokens = require('./tokens.cjs');
|
|
2
|
+
|
|
3
|
+
//#region src/markdown.ts
|
|
4
|
+
/**
 * Checks if a line is an ATX-style heading.
 *
 * An ATX heading is 1–6 `#` characters followed by whitespace.
 *
 * @param line The line to check.
 * @returns The heading level (1-6) or 0 if not a heading.
 */
function getAtxHeadingLevel(line) {
	const match = /^(#{1,6})\s/.exec(line);
	return match == null ? 0 : match[1].length;
}
|
|
15
|
+
/**
 * Checks if a line is a Setext-style heading underline.
 *
 * An underline is a run of at least three `=` (level 1) or `-` (level 2)
 * characters, ignoring surrounding whitespace.
 *
 * @param line The line to check.
 * @returns The heading level (1 for =, 2 for -) or 0 if not an underline.
 */
function getSetextUnderlineLevel(line) {
	const trimmed = line.trim();
	if (trimmed.length < 3) return 0;
	if (/^=+$/.test(trimmed)) return 1;
	if (/^-+$/.test(trimmed)) return 2;
	return 0;
}
|
|
27
|
+
/**
 * Checks if a line starts a code fence at column 0 (not indented).
 *
 * @param line The line to check.
 * @returns The fence pattern if it's a code fence start, null otherwise.
 */
function getCodeFenceStart(line) {
	const match = /^(`{3,}|~{3,})/.exec(line);
	return match?.[1] ?? null;
}
|
|
38
|
+
/**
 * Checks if a line closes a code fence.
 *
 * A closing fence is at least `fenceLength` repetitions of `fenceChar` at
 * column 0, followed only by optional whitespace.
 *
 * @param line The line to check.
 * @param fenceChar The fence character (` or ~).
 * @param fenceLength The minimum fence length.
 * @returns True if the line closes the fence.
 */
function isCodeFenceEnd(line, fenceChar, fenceLength) {
	const closer = new RegExp(`^${fenceChar}{${fenceLength},}\\s*$`);
	return closer.test(line);
}
|
|
49
|
+
/**
 * Parses Markdown content into sections.
 *
 * Each section starts with a heading (ATX or Setext style) and contains
 * all content until the next heading of equal or higher level.
 *
 * Lines inside fenced code blocks are never treated as headings or new
 * fences. NOTE(review): despite the doc wording, every heading flushes the
 * previous section regardless of relative level — confirm intended.
 *
 * @param text The Markdown text to parse.
 * @returns An array of sections.
 */
function parseIntoSections(text) {
	const lines = text.split(/\r?\n/);
	const sections = [];
	// Accumulator state for the section currently being built.
	let currentHeading = "";
	let currentLevel = 0;
	let currentLines = [];
	// Fence-tracking state: while inside a fence, heading detection is off.
	let inCodeBlock = false;
	let codeFenceChar = "";
	let codeFenceLength = 0;
	// Pushes the accumulated section (if non-empty) and resets the state.
	function flushSection() {
		if (currentHeading.length > 0 || currentLines.length > 0) sections.push({
			heading: currentHeading,
			content: currentLines.join("\n").trim(),
			level: currentLevel
		});
		currentHeading = "";
		currentLevel = 0;
		currentLines = [];
	}
	for (let i = 0; i < lines.length; i++) {
		const line = lines[i];
		if (inCodeBlock) {
			// Inside a fence: copy the line verbatim, watch for the closer.
			currentLines.push(line);
			if (isCodeFenceEnd(line, codeFenceChar, codeFenceLength)) {
				inCodeBlock = false;
				codeFenceChar = "";
				codeFenceLength = 0;
			}
			continue;
		}
		const fence = getCodeFenceStart(line);
		if (fence != null) {
			// Fence opener: remember char and length to match the closer.
			currentLines.push(line);
			inCodeBlock = true;
			codeFenceChar = fence[0];
			codeFenceLength = fence.length;
			continue;
		}
		const atxLevel = getAtxHeadingLevel(line);
		if (atxLevel > 0) {
			// ATX heading starts a new section.
			flushSection();
			currentHeading = line;
			currentLevel = atxLevel;
			continue;
		}
		// Setext heading: a non-blank, non-indented line followed by an
		// underline of = or -. The underline line is consumed too (i++).
		if (i + 1 < lines.length && line.trim().length > 0 && !line.startsWith(" ")) {
			const setextLevel = getSetextUnderlineLevel(lines[i + 1]);
			if (setextLevel > 0) {
				flushSection();
				currentHeading = `${line}\n${lines[i + 1]}`;
				currentLevel = setextLevel;
				i++;
				continue;
			}
		}
		currentLines.push(line);
	}
	// Flush whatever remains after the last line.
	flushSection();
	return sections;
}
|
|
118
|
+
/**
 * Determines the primary content type of a section's content.
 *
 * Heuristic: if more than half of the non-blank lines belong to fenced code
 * blocks, the section is "code"; else if more than half belong to list
 * items (markers or continuation lines indented past the marker), it is
 * "list"; otherwise "paragraph".
 *
 * @param content The section content.
 * @returns The primary chunk type.
 */
function getSectionContentType(content) {
	// Only non-blank lines participate in the majority votes below.
	const lines = content.split("\n").filter((l) => l.trim().length > 0);
	if (lines.length === 0) return "paragraph";
	// Pass 1: count lines inside (or opening) fenced code blocks.
	let inCode = false;
	let codeLines = 0;
	let fenceChar = "";
	let fenceLength = 0;
	for (const line of lines) if (inCode) {
		codeLines++;
		if (isCodeFenceEnd(line, fenceChar, fenceLength)) inCode = false;
	} else {
		const fence = getCodeFenceStart(line);
		if (fence != null) {
			inCode = true;
			fenceChar = fence[0];
			fenceLength = fence.length;
			codeLines++;
		}
	}
	if (codeLines > lines.length / 2) return "code";
	// Pass 2: count list markers and their indented continuation lines.
	let listItemCount = 0;
	let listContentLines = 0;
	let inListItem = false;
	let listMarkerIndent = -1;
	for (const line of lines) {
		// Bullet (-, *, +) or ordered (1.) marker followed by whitespace.
		const listMatch = line.match(/^(\s*)([-*+]|\d+\.)\s/);
		if (listMatch != null) {
			listItemCount++;
			listContentLines++;
			inListItem = true;
			listMarkerIndent = listMatch[1].length;
		} else if (inListItem) if ((line.match(/^(\s*)/)?.[1].length ?? 0) > listMarkerIndent) listContentLines++;
		else inListItem = false;
	}
	if (listItemCount > 0 && listContentLines > lines.length / 2) return "list";
	return "paragraph";
}
|
|
161
|
+
/**
 * Splits text by sentences when line-level splitting isn't possible.
 *
 * Greedily packs sentences into parts, starting a new part whenever adding
 * the next sentence would exceed the token limit. A single sentence longer
 * than the limit is kept whole.
 *
 * @param text The text to split.
 * @param maxTokens The maximum tokens per piece.
 * @param countTokens The token counter function.
 * @returns An array of text pieces.
 */
function splitBySentences(text, maxTokens, countTokens$1) {
	const parts = [];
	let current = "";
	for (const sentence of text.split(/(?<=[.!?])\s+/)) {
		const candidate = current.length > 0 ? `${current} ${sentence}` : sentence;
		if (current.length > 0 && countTokens$1(candidate) > maxTokens) {
			parts.push(current);
			current = sentence;
		} else {
			current = candidate;
		}
	}
	if (current.length > 0) parts.push(current);
	// Never return an empty array: fall back to the whole input.
	return parts.length > 0 ? parts : [text];
}
|
|
183
|
+
/**
 * Splits a section's content into smaller pieces if it exceeds the token limit.
 *
 * Cascading strategy: pack whole paragraphs greedily; a paragraph that is
 * itself oversized is split by lines; a line that is still oversized is
 * split by sentences (see `splitBySentences`).
 *
 * @param content The content to split.
 * @param maxTokens The maximum tokens per piece.
 * @param countTokens The token counter function.
 * @returns An array of content pieces.
 */
function splitContent(content, maxTokens, countTokens$1) {
	// Fast path: the whole content fits.
	if (countTokens$1(content) <= maxTokens) return [content];
	const parts = [];
	// Paragraphs are separated by one or more blank lines.
	const paragraphs = content.split(/\n\n+/);
	let currentPart = "";
	for (const para of paragraphs) {
		const newPart = currentPart.length > 0 ? `${currentPart}\n\n${para}` : para;
		if (countTokens$1(newPart) > maxTokens) {
			// Flush what we have before handling the overflowing paragraph.
			if (currentPart.length > 0) parts.push(currentPart);
			if (countTokens$1(para) > maxTokens) {
				// Paragraph alone is too big: fall back to line packing.
				const lines = para.split("\n");
				let linePart = "";
				for (const line of lines) {
					const newLinePart = linePart.length > 0 ? `${linePart}\n${line}` : line;
					if (countTokens$1(newLinePart) > maxTokens && linePart.length > 0) {
						parts.push(linePart);
						linePart = line;
					} else linePart = newLinePart;
				}
				if (linePart.length > 0 && countTokens$1(linePart) > maxTokens) {
					// Last line-run still too big: sentence-level split; the
					// final sentence part becomes the running accumulator so
					// following paragraphs can pack onto it.
					const sentenceParts = splitBySentences(linePart, maxTokens, countTokens$1);
					for (let i = 0; i < sentenceParts.length - 1; i++) parts.push(sentenceParts[i]);
					currentPart = sentenceParts[sentenceParts.length - 1];
				} else if (linePart.length > 0) currentPart = linePart;
			} else currentPart = para;
		} else currentPart = newPart;
	}
	if (currentPart.length > 0) parts.push(currentPart);
	return parts;
}
|
|
221
|
+
/**
 * Builds a chunker for Markdown input.
 *
 * Markdown is first parsed into sections (a heading plus its body text).
 * A section that fits the token budget becomes one chunk; otherwise its
 * body is divided, keeping the heading attached to the first piece
 * whenever the remaining budget allows.
 *
 * @returns A chunker function for Markdown content.
 * @since 0.1.0
 */
function createMarkdownChunker() {
	return async (text, options) => {
		const maxTokens = options?.maxTokens ?? 4096;
		const countTokens$1 = options?.countTokens ?? require_tokens.countTokens;
		const signal = options?.signal;
		signal?.throwIfAborted();
		await Promise.resolve();
		const chunks = [];
		// Appends one chunk, assigning sequential indices automatically.
		const emit = (content, type) => {
			chunks.push({
				content,
				type,
				index: chunks.length
			});
		};
		for (const section of parseIntoSections(text)) {
			signal?.throwIfAborted();
			const hasHeading = section.heading.length > 0;
			const hasContent = section.content.length > 0;
			let fullSection;
			if (hasHeading && hasContent) fullSection = `${section.heading}\n\n${section.content}`;
			else fullSection = hasHeading ? section.heading : section.content;
			if (fullSection.length === 0) continue;
			// Whole section fits: keep it as a single chunk.
			if (countTokens$1(fullSection) <= maxTokens) {
				emit(fullSection, hasHeading ? "section" : getSectionContentType(section.content));
				continue;
			}
			if (hasHeading && hasContent) {
				// Budget left for body text once the heading and separator are paid for.
				const budget = maxTokens - countTokens$1(section.heading) - countTokens$1("\n\n");
				if (budget > 0) {
					const pieces = splitContent(section.content, budget, countTokens$1);
					if (pieces.length > 0) {
						// The heading travels with the first piece; the rest stand alone.
						emit(`${section.heading}\n\n${pieces[0]}`, "section");
						for (let i = 1; i < pieces.length; i++) emit(pieces[i], getSectionContentType(pieces[i]));
					} else emit(section.heading, "heading");
				} else {
					// Heading alone consumes the budget: emit it separately, then the body.
					emit(section.heading, "heading");
					for (const piece of splitContent(section.content, maxTokens, countTokens$1)) emit(piece, getSectionContentType(piece));
				}
			} else {
				// Heading-only or body-only section: split it as plain content.
				for (const piece of splitContent(fullSection, maxTokens, countTokens$1)) emit(piece, getSectionContentType(piece));
			}
		}
		return chunks;
	};
}

//#endregion
exports.createMarkdownChunker = createMarkdownChunker;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { Chunker } from "./chunking.cjs";

//#region src/markdown.d.ts

/**
 * Creates a Markdown chunker.
 *
 * The chunker parses Markdown content into sections (heading + content) and
 * creates chunks that respect section boundaries. Each section is kept as a
 * single chunk when possible, and only split when exceeding the token limit.
 *
 * @returns A chunker function for Markdown content.
 * @since 0.1.0
 */
declare function createMarkdownChunker(): Chunker;
//#endregion
export { createMarkdownChunker };
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { Chunker } from "./chunking.js";

//#region src/markdown.d.ts

/**
 * Creates a Markdown chunker.
 *
 * The chunker parses Markdown content into sections (heading + content) and
 * creates chunks that respect section boundaries. Each section is kept as a
 * single chunk when possible, and only split when exceeding the token limit.
 *
 * @returns A chunker function for Markdown content.
 * @since 0.1.0
 */
declare function createMarkdownChunker(): Chunker;
//#endregion
export { createMarkdownChunker };
|