@vertana/core 0.1.0-dev.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -0
- package/dist/_virtual/rolldown_runtime.cjs +29 -0
- package/dist/accumulator.cjs +64 -0
- package/dist/accumulator.d.cts +51 -0
- package/dist/accumulator.d.ts +51 -0
- package/dist/accumulator.js +61 -0
- package/dist/chunking.cjs +76 -0
- package/dist/chunking.d.cts +124 -0
- package/dist/chunking.d.ts +124 -0
- package/dist/chunking.js +74 -0
- package/dist/context.cjs +51 -0
- package/dist/context.d.cts +148 -0
- package/dist/context.d.ts +148 -0
- package/dist/context.js +49 -0
- package/dist/evaluation.cjs +120 -0
- package/dist/evaluation.d.cts +111 -0
- package/dist/evaluation.d.ts +111 -0
- package/dist/evaluation.js +119 -0
- package/dist/glossary.cjs +0 -0
- package/dist/glossary.d.cts +25 -0
- package/dist/glossary.d.ts +25 -0
- package/dist/glossary.js +0 -0
- package/dist/html.cjs +253 -0
- package/dist/html.d.cts +41 -0
- package/dist/html.d.ts +41 -0
- package/dist/html.js +250 -0
- package/dist/index.cjs +39 -0
- package/dist/index.d.cts +17 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +16 -0
- package/dist/markdown.cjs +300 -0
- package/dist/markdown.d.cts +17 -0
- package/dist/markdown.d.ts +17 -0
- package/dist/markdown.js +300 -0
- package/dist/plaintext.cjs +70 -0
- package/dist/plaintext.d.cts +17 -0
- package/dist/plaintext.d.ts +17 -0
- package/dist/plaintext.js +70 -0
- package/dist/prompt.cjs +91 -0
- package/dist/prompt.d.cts +74 -0
- package/dist/prompt.d.ts +74 -0
- package/dist/prompt.js +86 -0
- package/dist/refine.cjs +243 -0
- package/dist/refine.d.cts +148 -0
- package/dist/refine.d.ts +148 -0
- package/dist/refine.js +241 -0
- package/dist/select.cjs +62 -0
- package/dist/select.d.cts +83 -0
- package/dist/select.d.ts +83 -0
- package/dist/select.js +61 -0
- package/dist/terms.cjs +60 -0
- package/dist/terms.d.cts +36 -0
- package/dist/terms.d.ts +36 -0
- package/dist/terms.js +59 -0
- package/dist/tokens.cjs +40 -0
- package/dist/tokens.d.cts +24 -0
- package/dist/tokens.d.ts +24 -0
- package/dist/tokens.js +38 -0
- package/dist/tools.cjs +35 -0
- package/dist/tools.d.cts +20 -0
- package/dist/tools.d.ts +20 -0
- package/dist/tools.js +34 -0
- package/dist/translate.cjs +200 -0
- package/dist/translate.d.cts +190 -0
- package/dist/translate.d.ts +190 -0
- package/dist/translate.js +199 -0
- package/dist/window.cjs +0 -0
- package/dist/window.d.cts +48 -0
- package/dist/window.d.ts +48 -0
- package/dist/window.js +0 -0
- package/package.json +215 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright 2025 Hong Minhee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
7
|
+
the Software without restriction, including without limitation the rights to
|
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
10
|
+
subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/dist/_virtual/rolldown_runtime.cjs
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
//#region rolldown:runtime
|
|
2
|
+
// Short aliases for the reflection primitives used by the interop helpers below.
const __create = Object.create;
const __defProp = Object.defineProperty;
const __getOwnPropDesc = Object.getOwnPropertyDescriptor;
const __getOwnPropNames = Object.getOwnPropertyNames;
const __getProtoOf = Object.getPrototypeOf;
const __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Mirrors the own properties of `from` onto `to` as getter properties.
 *
 * Properties that `to` already owns, and the one named `except`, are left
 * alone.  Each mirrored property reads through to `from` on every access and
 * keeps the enumerability of the source property.
 *
 * @param to The object that receives the forwarding getters.
 * @param from The object (or function) whose own properties are mirrored.
 *             Anything else is ignored.
 * @param except An optional property name to skip.
 * @param desc Scratch slot for the current property descriptor.
 * @returns The `to` object.
 */
const __copyProps = (to, from, except, desc) => {
	const copyable = (from && typeof from === "object") || typeof from === "function";
	if (!copyable) return to;
	for (const key of __getOwnPropNames(from)) {
		if (__hasOwnProp.call(to, key) || key === except) continue;
		desc = __getOwnPropDesc(from, key);
		__defProp(to, key, {
			get: () => from[key],
			enumerable: !desc || desc.enumerable
		});
	}
	return to;
};

/**
 * Wraps a required module in an ES-module-shaped namespace object.
 *
 * The wrapper inherits from the module's prototype and forwards every own
 * property of the module.  A `default` property holding the module itself is
 * added when `isNodeMode` is truthy, when the module is falsy, or when the
 * module is not flagged with `__esModule`.
 *
 * @param mod The module value to wrap.
 * @param isNodeMode When truthy, always add the `default` property.
 * @param target Scratch slot for the wrapper object being built.
 * @returns The wrapper object.
 */
const __toESM = (mod, isNodeMode, target) => {
	target = mod != null ? __create(__getProtoOf(mod)) : {};
	const needsDefault = isNodeMode || !mod || !mod.__esModule;
	const destination = needsDefault
		? __defProp(target, "default", {
			value: mod,
			enumerable: true
		})
		: target;
	return __copyProps(destination, mod);
};
|
|
26
|
+
|
|
27
|
+
//#endregion
|
|
28
|
+
|
|
29
|
+
exports.__toESM = __toESM;
|
package/dist/accumulator.cjs
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
|
|
2
|
+
//#region src/accumulator.ts
|
|
3
|
+
/**
 * Builds the starting point for accumulating translation stream events.
 *
 * @returns A new state whose quality score total and count are both `0` and
 *          whose model win table is a new, empty `Map`.
 */
function createInitialAccumulatorState() {
	const modelWinCounts = /* @__PURE__ */ new Map();
	const initialState = {
		totalQualityScore: 0,
		qualityScoreCount: 0,
		modelWinCounts
	};
	return initialState;
}
|
|
15
|
+
/**
 * Folds one translation stream event into an accumulator state.
 *
 * The input state is never modified.  A `"complete"` event is stored under
 * `complete`.  Any other event adds its `qualityScore` (when present) to the
 * running total and bumps the win count of its `selectedModel` (when
 * present).  An event carrying neither leaves the state as it was, and the
 * very same state object is handed back.
 *
 * @param state The current accumulator state.
 * @param event The event to accumulate.
 * @returns The state that results from accumulating the event.
 */
function accumulateEvent(state, event) {
	if (event.type === "complete") {
		return {
			...state,
			complete: event
		};
	}
	const { qualityScore, selectedModel } = event;
	let next = state;
	if (qualityScore != null) {
		const totalQualityScore = next.totalQualityScore + qualityScore;
		const qualityScoreCount = next.qualityScoreCount + 1;
		next = {
			...next,
			totalQualityScore,
			qualityScoreCount
		};
	}
	if (selectedModel != null) {
		// Copy the table first so the previous state's map stays untouched.
		const wins = new Map(next.modelWinCounts);
		const previousWins = wins.get(selectedModel) ?? 0;
		wins.set(selectedModel, previousWins + 1);
		next = {
			...next,
			modelWinCounts: wins
		};
	}
	return next;
}
|
|
45
|
+
/**
 * Returns the key with the highest value in a map.
 *
 * When several keys share the highest value, the one that comes first in the
 * map's iteration order wins.  `NaN` ranks below every other number, so a
 * `NaN` entry is only returned when the map holds nothing else.
 *
 * @param map A map of keys to numeric values.
 * @returns The key with the highest value, or undefined if the map is empty.
 */
function maxByValue(map) {
	let maxKey;
	let maxValue;
	// Tracked separately so that the first entry always seeds the result,
	// even when its value is `-Infinity` or `NaN`.
	let seeded = false;
	for (const [key, value] of map) {
		const improves = !seeded || value > maxValue || (Number.isNaN(maxValue) && !Number.isNaN(value));
		if (improves) {
			maxKey = key;
			maxValue = value;
			seeded = true;
		}
	}
	return maxKey;
}
|
|
60
|
+
|
|
61
|
+
//#endregion
|
|
62
|
+
exports.accumulateEvent = accumulateEvent;
|
|
63
|
+
exports.createInitialAccumulatorState = createInitialAccumulatorState;
|
|
64
|
+
exports.maxByValue = maxByValue;
|
package/dist/accumulator.d.cts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { TranslateChunksComplete, TranslateChunksEvent } from "./translate.cjs";
|
|
2
|
+
import { LanguageModel } from "ai";
|
|
3
|
+
|
|
4
|
+
//#region src/accumulator.d.ts
|
|
5
|
+
|
|
6
|
+
/**
 * Accumulated state from processing translation stream events.
 *
 * Every member is read-only; a state is meant to be replaced by the result
 * of `accumulateEvent`, not edited in place.
 */
interface AccumulatorState {
  /**
   * The completion event, if received.  Absent until an event whose `type`
   * is `"complete"` has been accumulated.
   */
  readonly complete?: TranslateChunksComplete;
  /**
   * Sum of quality scores from chunk events.
   */
  readonly totalQualityScore: number;
  /**
   * Number of chunks that had quality scores, i.e. how many values were
   * added into {@link totalQualityScore}.
   */
  readonly qualityScoreCount: number;
  /**
   * Count of wins per model during best-of-N selection, keyed by the model
   * that was selected.
   */
  readonly modelWinCounts: ReadonlyMap<LanguageModel, number>;
}
|
|
27
|
+
/**
 * Creates the initial accumulator state.
 *
 * @returns A fresh accumulator state with zeroed counters and an empty
 *          model win-count map.
 */
declare function createInitialAccumulatorState(): AccumulatorState;
/**
 * Accumulates a translation stream event into the state.
 *
 * This is a pure function that returns a new state without modifying the input.
 * A `"complete"` event is recorded as the state's `complete` member; any
 * other event contributes its quality score and selected model, when it
 * carries them.
 *
 * @param state The current accumulator state.
 * @param event The event to accumulate.
 * @returns A new state with the event accumulated.
 */
declare function accumulateEvent(state: AccumulatorState, event: TranslateChunksEvent): AccumulatorState;
/**
 * Returns the key with the highest value in a map.
 *
 * When several keys share the highest value, the key that comes first in
 * the map's iteration order is returned.
 *
 * @param map A map of keys to numeric values.
 * @returns The key with the highest value, or undefined if the map is empty.
 */
declare function maxByValue<K>(map: ReadonlyMap<K, number>): K | undefined;
|
|
50
|
+
//#endregion
|
|
51
|
+
export { AccumulatorState, accumulateEvent, createInitialAccumulatorState, maxByValue };
|
package/dist/accumulator.d.ts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { TranslateChunksComplete, TranslateChunksEvent } from "./translate.js";
|
|
2
|
+
import { LanguageModel } from "ai";
|
|
3
|
+
|
|
4
|
+
//#region src/accumulator.d.ts
|
|
5
|
+
|
|
6
|
+
/**
 * Accumulated state from processing translation stream events.
 *
 * Every member is read-only; a state is meant to be replaced by the result
 * of `accumulateEvent`, not edited in place.
 */
interface AccumulatorState {
  /**
   * The completion event, if received.  Absent until an event whose `type`
   * is `"complete"` has been accumulated.
   */
  readonly complete?: TranslateChunksComplete;
  /**
   * Sum of quality scores from chunk events.
   */
  readonly totalQualityScore: number;
  /**
   * Number of chunks that had quality scores, i.e. how many values were
   * added into {@link totalQualityScore}.
   */
  readonly qualityScoreCount: number;
  /**
   * Count of wins per model during best-of-N selection, keyed by the model
   * that was selected.
   */
  readonly modelWinCounts: ReadonlyMap<LanguageModel, number>;
}
|
|
27
|
+
/**
 * Creates the initial accumulator state.
 *
 * @returns A fresh accumulator state with zeroed counters and an empty
 *          model win-count map.
 */
declare function createInitialAccumulatorState(): AccumulatorState;
/**
 * Accumulates a translation stream event into the state.
 *
 * This is a pure function that returns a new state without modifying the input.
 * A `"complete"` event is recorded as the state's `complete` member; any
 * other event contributes its quality score and selected model, when it
 * carries them.
 *
 * @param state The current accumulator state.
 * @param event The event to accumulate.
 * @returns A new state with the event accumulated.
 */
declare function accumulateEvent(state: AccumulatorState, event: TranslateChunksEvent): AccumulatorState;
/**
 * Returns the key with the highest value in a map.
 *
 * When several keys share the highest value, the key that comes first in
 * the map's iteration order is returned.
 *
 * @param map A map of keys to numeric values.
 * @returns The key with the highest value, or undefined if the map is empty.
 */
declare function maxByValue<K>(map: ReadonlyMap<K, number>): K | undefined;
|
|
50
|
+
//#endregion
|
|
51
|
+
export { AccumulatorState, accumulateEvent, createInitialAccumulatorState, maxByValue };
|
package/dist/accumulator.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
//#region src/accumulator.ts
|
|
2
|
+
/**
 * Builds the starting point for accumulating translation stream events.
 *
 * @returns A new state whose quality score total and count are both `0` and
 *          whose model win table is a new, empty `Map`.
 */
function createInitialAccumulatorState() {
	const modelWinCounts = /* @__PURE__ */ new Map();
	const initialState = {
		totalQualityScore: 0,
		qualityScoreCount: 0,
		modelWinCounts
	};
	return initialState;
}
|
|
14
|
+
/**
 * Folds one translation stream event into an accumulator state.
 *
 * The input state is never modified.  A `"complete"` event is stored under
 * `complete`.  Any other event adds its `qualityScore` (when present) to the
 * running total and bumps the win count of its `selectedModel` (when
 * present).  An event carrying neither leaves the state as it was, and the
 * very same state object is handed back.
 *
 * @param state The current accumulator state.
 * @param event The event to accumulate.
 * @returns The state that results from accumulating the event.
 */
function accumulateEvent(state, event) {
	if (event.type === "complete") {
		return {
			...state,
			complete: event
		};
	}
	const { qualityScore, selectedModel } = event;
	let next = state;
	if (qualityScore != null) {
		const totalQualityScore = next.totalQualityScore + qualityScore;
		const qualityScoreCount = next.qualityScoreCount + 1;
		next = {
			...next,
			totalQualityScore,
			qualityScoreCount
		};
	}
	if (selectedModel != null) {
		// Copy the table first so the previous state's map stays untouched.
		const wins = new Map(next.modelWinCounts);
		const previousWins = wins.get(selectedModel) ?? 0;
		wins.set(selectedModel, previousWins + 1);
		next = {
			...next,
			modelWinCounts: wins
		};
	}
	return next;
}
|
|
44
|
+
/**
 * Returns the key with the highest value in a map.
 *
 * When several keys share the highest value, the one that comes first in the
 * map's iteration order wins.  `NaN` ranks below every other number, so a
 * `NaN` entry is only returned when the map holds nothing else.
 *
 * @param map A map of keys to numeric values.
 * @returns The key with the highest value, or undefined if the map is empty.
 */
function maxByValue(map) {
	let maxKey;
	let maxValue;
	// Tracked separately so that the first entry always seeds the result,
	// even when its value is `-Infinity` or `NaN`.
	let seeded = false;
	for (const [key, value] of map) {
		const improves = !seeded || value > maxValue || (Number.isNaN(maxValue) && !Number.isNaN(value));
		if (improves) {
			maxKey = key;
			maxValue = value;
			seeded = true;
		}
	}
	return maxKey;
}
|
|
59
|
+
|
|
60
|
+
//#endregion
|
|
61
|
+
export { accumulateEvent, createInitialAccumulatorState, maxByValue };
|
package/dist/chunking.cjs
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
const require_rolldown_runtime = require('./_virtual/rolldown_runtime.cjs');
|
|
2
|
+
let _logtape_logtape = require("@logtape/logtape");
|
|
3
|
+
|
|
4
|
+
//#region src/chunking.ts
|
|
5
|
+
// Module-level logger for the chunking code, obtained from LogTape under the
// ["vertana", "core", "chunking"] category.
const logger = (0, _logtape_logtape.getLogger)([
	"vertana",
	"core",
	"chunking"
]);
|
|
10
|
+
/**
 * Picks the built-in chunker that matches a media type.
 *
 * The module providing the chunker is loaded only when it is needed.
 * `"text/html"` and `"text/plain"` get their dedicated chunkers; every other
 * value, including an omitted media type, gets the Markdown chunker.
 *
 * @param mediaType The media type of the text.
 * @returns A promise that resolves to the appropriate chunker for the media type.
 */
async function getDefaultChunker(mediaType) {
	switch (mediaType) {
		case "text/html": {
			const { createHtmlChunker } = await Promise.resolve().then(() => require("./html.cjs"));
			return createHtmlChunker();
		}
		case "text/plain": {
			const { createPlainTextChunker } = await Promise.resolve().then(() => require("./plaintext.cjs"));
			return createPlainTextChunker();
		}
		default: {
			const { createMarkdownChunker } = await Promise.resolve().then(() => require("./markdown.cjs"));
			return createMarkdownChunker();
		}
	}
}
|
|
28
|
+
/**
 * Splits text into translation-sized pieces and returns their contents.
 *
 * Chooses a chunker (the caller's, or the default one for the media type),
 * runs it, and keeps only each chunk's `content`.  The whole text comes back
 * as a single-element array when chunking is disabled (`chunker` is `null`)
 * or when the chunker yields no chunks at all.
 *
 * @param text The text to chunk.
 * @param options Options for chunking.
 * @returns A promise that resolves to an array of chunk content strings.
 */
async function chunkText(text, options) {
	const signal = options?.signal;
	// Stop before doing any work if the caller has already aborted.
	signal?.throwIfAborted();
	const mediaType = options?.mediaType ?? "text/markdown";
	const textLength = text.length;
	if (options?.chunker === null) {
		logger.debug("Chunking disabled, returning as single chunk.", { textLength });
		return [text];
	}
	const maxTokens = options?.maxTokens ?? 4096;
	logger.debug("Chunking text...", {
		mediaType,
		textLength,
		maxTokens
	});
	const chunker = options?.chunker ?? await getDefaultChunker(options?.mediaType);
	// The default token counter module is loaded only when none was supplied.
	const countTokens = options?.countTokens ?? (await Promise.resolve().then(() => require("./tokens.cjs"))).countTokens;
	const chunks = await chunker(text, {
		maxTokens,
		countTokens,
		signal
	});
	if (chunks.length === 0) {
		logger.debug("No chunks produced, returning as single chunk.", { textLength });
		return [text];
	}
	logger.debug("Chunking completed.", {
		chunkCount: chunks.length,
		mediaType
	});
	return chunks.map(({ content }) => content);
}
|
|
73
|
+
|
|
74
|
+
//#endregion
|
|
75
|
+
exports.chunkText = chunkText;
|
|
76
|
+
exports.getDefaultChunker = getDefaultChunker;
|
package/dist/chunking.d.cts
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { MediaType } from "./prompt.cjs";
|
|
2
|
+
|
|
3
|
+
//#region src/chunking.d.ts
|
|
4
|
+
|
|
5
|
+
/**
 * A function that counts the number of tokens in a string.
 *
 * The count is returned synchronously.
 *
 * @param text The text to count tokens for.
 * @returns The number of tokens.
 */
type TokenCounter = (text: string) => number;
/**
 * Options for {@link Chunker}.  Every option may be omitted.
 */
interface ChunkerOptions {
  /**
   * The maximum number of tokens per chunk.
   *
   * @default `4096`
   */
  readonly maxTokens?: number;
  /**
   * A custom token counter function.  If not provided, a default
   * implementation using js-tiktoken (cl100k_base encoding) is used.
   */
  readonly countTokens?: TokenCounter;
  /**
   * An optional `AbortSignal` to cancel the chunking operation.
   */
  readonly signal?: AbortSignal;
}
/**
 * Splits text into chunks for translation.
 *
 * A chunker is asynchronous and resolves to a read-only array of
 * {@link Chunk} objects.
 *
 * @param text The text to split into chunks.
 * @param options Optional settings for the chunking operation.
 * @returns A promise that resolves to an array of chunks.
 */
type Chunker = (text: string, options?: ChunkerOptions) => Promise<readonly Chunk[]>;
|
|
40
|
+
/**
 * The type of content in a chunk.
 *
 * - `"paragraph"`: A paragraph of text.
 * - `"section"`: A section of the document.
 * - `"heading"`: A heading or title.
 * - `"list"`: A list of items.
 * - `"code"`: A code block.
 */
type ChunkType = "paragraph" | "section" | "heading" | "list" | "code";
/**
 * A chunk of text to be translated, as produced by a chunker.
 */
interface Chunk {
  /**
   * The text content of the chunk.  This is the only member that
   * `chunkText` passes on to its caller.
   */
  readonly content: string;
  /**
   * The type of content in the chunk.
   */
  readonly type: ChunkType;
  /**
   * The zero-based index of the chunk in the document.
   */
  readonly index: number;
}
|
|
67
|
+
/**
 * Options for {@link chunkText}.  Every option may be omitted.
 */
interface ChunkTextOptions {
  /**
   * The media type of the text.  Used to select the default chunker
   * when {@link chunker} is not provided.
   *
   * - `"text/html"`: Uses the HTML chunker.
   * - `"text/markdown"`: Uses the Markdown chunker.
   * - `"text/plain"`: Uses the plain text chunker.
   *
   * @default `"text/markdown"`
   */
  readonly mediaType?: MediaType;
  /**
   * A custom chunker function.  If not provided, a default chunker
   * based on {@link mediaType} is used.  Set to `null` to disable
   * chunking entirely (text will be returned as a single chunk).
   */
  readonly chunker?: Chunker | null;
  /**
   * The maximum number of tokens per chunk.  Forwarded to the chunker.
   *
   * @default `4096`
   */
  readonly maxTokens?: number;
  /**
   * A custom token counter function.  If not provided, a default
   * implementation using js-tiktoken (cl100k_base encoding) is used.
   * Forwarded to the chunker.
   */
  readonly countTokens?: TokenCounter;
  /**
   * An optional `AbortSignal` to cancel the chunking operation.  It is
   * checked once before any work starts and is also forwarded to the
   * chunker.
   */
  readonly signal?: AbortSignal;
}
|
|
104
|
+
/**
 * Gets the default chunker based on media type.
 *
 * `"text/html"` and `"text/plain"` select their dedicated chunkers; any
 * other value, including an omitted media type, selects the Markdown
 * chunker.
 *
 * @param mediaType The media type of the text.
 * @returns A promise that resolves to the appropriate chunker for the media type.
 */
declare function getDefaultChunker(mediaType?: MediaType): Promise<Chunker>;
/**
 * Chunks text into smaller pieces for translation.
 *
 * This is a convenience function that combines chunker selection and execution.
 * If chunking is disabled (chunker is `null`), the text is returned as a
 * single-element array.  The same happens when the chunker produces no
 * chunks at all.
 *
 * @param text The text to chunk.
 * @param options Options for chunking.
 * @returns A promise that resolves to an array of chunk content strings.
 */
declare function chunkText(text: string, options?: ChunkTextOptions): Promise<readonly string[]>;
|
|
123
|
+
//#endregion
|
|
124
|
+
export { Chunk, ChunkTextOptions, ChunkType, Chunker, ChunkerOptions, type MediaType, TokenCounter, chunkText, getDefaultChunker };
|
package/dist/chunking.d.ts
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { MediaType } from "./prompt.js";
|
|
2
|
+
|
|
3
|
+
//#region src/chunking.d.ts
|
|
4
|
+
|
|
5
|
+
/**
 * A function that counts the number of tokens in a string.
 *
 * The count is returned synchronously.
 *
 * @param text The text to count tokens for.
 * @returns The number of tokens.
 */
type TokenCounter = (text: string) => number;
/**
 * Options for {@link Chunker}.  Every option may be omitted.
 */
interface ChunkerOptions {
  /**
   * The maximum number of tokens per chunk.
   *
   * @default `4096`
   */
  readonly maxTokens?: number;
  /**
   * A custom token counter function.  If not provided, a default
   * implementation using js-tiktoken (cl100k_base encoding) is used.
   */
  readonly countTokens?: TokenCounter;
  /**
   * An optional `AbortSignal` to cancel the chunking operation.
   */
  readonly signal?: AbortSignal;
}
/**
 * Splits text into chunks for translation.
 *
 * A chunker is asynchronous and resolves to a read-only array of
 * {@link Chunk} objects.
 *
 * @param text The text to split into chunks.
 * @param options Optional settings for the chunking operation.
 * @returns A promise that resolves to an array of chunks.
 */
type Chunker = (text: string, options?: ChunkerOptions) => Promise<readonly Chunk[]>;
|
|
40
|
+
/**
 * The type of content in a chunk.
 *
 * - `"paragraph"`: A paragraph of text.
 * - `"section"`: A section of the document.
 * - `"heading"`: A heading or title.
 * - `"list"`: A list of items.
 * - `"code"`: A code block.
 */
type ChunkType = "paragraph" | "section" | "heading" | "list" | "code";
/**
 * A chunk of text to be translated, as produced by a chunker.
 */
interface Chunk {
  /**
   * The text content of the chunk.  This is the only member that
   * `chunkText` passes on to its caller.
   */
  readonly content: string;
  /**
   * The type of content in the chunk.
   */
  readonly type: ChunkType;
  /**
   * The zero-based index of the chunk in the document.
   */
  readonly index: number;
}
|
|
67
|
+
/**
 * Options for {@link chunkText}.  Every option may be omitted.
 */
interface ChunkTextOptions {
  /**
   * The media type of the text.  Used to select the default chunker
   * when {@link chunker} is not provided.
   *
   * - `"text/html"`: Uses the HTML chunker.
   * - `"text/markdown"`: Uses the Markdown chunker.
   * - `"text/plain"`: Uses the plain text chunker.
   *
   * @default `"text/markdown"`
   */
  readonly mediaType?: MediaType;
  /**
   * A custom chunker function.  If not provided, a default chunker
   * based on {@link mediaType} is used.  Set to `null` to disable
   * chunking entirely (text will be returned as a single chunk).
   */
  readonly chunker?: Chunker | null;
  /**
   * The maximum number of tokens per chunk.  Forwarded to the chunker.
   *
   * @default `4096`
   */
  readonly maxTokens?: number;
  /**
   * A custom token counter function.  If not provided, a default
   * implementation using js-tiktoken (cl100k_base encoding) is used.
   * Forwarded to the chunker.
   */
  readonly countTokens?: TokenCounter;
  /**
   * An optional `AbortSignal` to cancel the chunking operation.  It is
   * checked once before any work starts and is also forwarded to the
   * chunker.
   */
  readonly signal?: AbortSignal;
}
|
|
104
|
+
/**
 * Gets the default chunker based on media type.
 *
 * `"text/html"` and `"text/plain"` select their dedicated chunkers; any
 * other value, including an omitted media type, selects the Markdown
 * chunker.
 *
 * @param mediaType The media type of the text.
 * @returns A promise that resolves to the appropriate chunker for the media type.
 */
declare function getDefaultChunker(mediaType?: MediaType): Promise<Chunker>;
/**
 * Chunks text into smaller pieces for translation.
 *
 * This is a convenience function that combines chunker selection and execution.
 * If chunking is disabled (chunker is `null`), the text is returned as a
 * single-element array.  The same happens when the chunker produces no
 * chunks at all.
 *
 * @param text The text to chunk.
 * @param options Options for chunking.
 * @returns A promise that resolves to an array of chunk content strings.
 */
declare function chunkText(text: string, options?: ChunkTextOptions): Promise<readonly string[]>;
|
|
123
|
+
//#endregion
|
|
124
|
+
export { Chunk, ChunkTextOptions, ChunkType, Chunker, ChunkerOptions, type MediaType, TokenCounter, chunkText, getDefaultChunker };
|
package/dist/chunking.js
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { getLogger } from "@logtape/logtape";
|
|
2
|
+
|
|
3
|
+
//#region src/chunking.ts
|
|
4
|
+
// Module-level logger for the chunking code, obtained from LogTape under the
// ["vertana", "core", "chunking"] category.
const logger = getLogger([
	"vertana",
	"core",
	"chunking"
]);
|
|
9
|
+
/**
 * Picks the built-in chunker that matches a media type.
 *
 * The module providing the chunker is imported only when it is needed.
 * `"text/html"` and `"text/plain"` get their dedicated chunkers; every other
 * value, including an omitted media type, gets the Markdown chunker.
 *
 * @param mediaType The media type of the text.
 * @returns A promise that resolves to the appropriate chunker for the media type.
 */
async function getDefaultChunker(mediaType) {
	switch (mediaType) {
		case "text/html": {
			const { createHtmlChunker } = await import("./html.js");
			return createHtmlChunker();
		}
		case "text/plain": {
			const { createPlainTextChunker } = await import("./plaintext.js");
			return createPlainTextChunker();
		}
		default: {
			const { createMarkdownChunker } = await import("./markdown.js");
			return createMarkdownChunker();
		}
	}
}
|
|
27
|
+
/**
 * Splits text into translation-sized pieces and returns their contents.
 *
 * Chooses a chunker (the caller's, or the default one for the media type),
 * runs it, and keeps only each chunk's `content`.  The whole text comes back
 * as a single-element array when chunking is disabled (`chunker` is `null`)
 * or when the chunker yields no chunks at all.
 *
 * @param text The text to chunk.
 * @param options Options for chunking.
 * @returns A promise that resolves to an array of chunk content strings.
 */
async function chunkText(text, options) {
	const signal = options?.signal;
	// Stop before doing any work if the caller has already aborted.
	signal?.throwIfAborted();
	const mediaType = options?.mediaType ?? "text/markdown";
	const textLength = text.length;
	if (options?.chunker === null) {
		logger.debug("Chunking disabled, returning as single chunk.", { textLength });
		return [text];
	}
	const maxTokens = options?.maxTokens ?? 4096;
	logger.debug("Chunking text...", {
		mediaType,
		textLength,
		maxTokens
	});
	const chunker = options?.chunker ?? await getDefaultChunker(options?.mediaType);
	// The default token counter module is imported only when none was supplied.
	const countTokens = options?.countTokens ?? (await import("./tokens.js")).countTokens;
	const chunks = await chunker(text, {
		maxTokens,
		countTokens,
		signal
	});
	if (chunks.length === 0) {
		logger.debug("No chunks produced, returning as single chunk.", { textLength });
		return [text];
	}
	logger.debug("Chunking completed.", {
		chunkCount: chunks.length,
		mediaType
	});
	return chunks.map(({ content }) => content);
}
|
|
72
|
+
|
|
73
|
+
//#endregion
|
|
74
|
+
export { chunkText, getDefaultChunker };
|