@smart-cloud/ai-kit-ui 1.3.15 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +19 -9
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +19 -9
- package/package.json +26 -23
- package/src/ai-feature/AiFeature.tsx +98 -8
- package/src/ai-feature/chunked-features.ts +254 -0
- package/src/ai-feature/chunking-utils.ts +211 -0
- package/tsup.config.ts +2 -1
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text chunking utilities for handling large inputs in AI features
|
|
3
|
+
*
|
|
4
|
+
* Chunking is needed for:
|
|
5
|
+
* - On-device models with token quotas (~8000 tokens)
|
|
6
|
+
* - AWS Translate backend (10,000 character limit)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * One piece of a larger text, as produced by `splitTextIntoChunks`.
 */
export interface TextChunk {
  /** Chunk content with leading and trailing whitespace trimmed. */
  text: string;
  /** Index in the source text where the chunk's untrimmed span begins (inclusive). */
  start: number;
  /** Index in the source text where the chunk's untrimmed span ends (exclusive). */
  end: number;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Estimate how many model tokens a piece of text will consume.
 *
 * This is a length-based heuristic, not a tokenizer: it assumes roughly
 * 3.5 characters per token (a ratio chosen for Hungarian text) and rounds
 * up, so any non-empty text counts as at least one token.
 *
 * @param text - Text to measure; its length is counted in UTF-16 code units.
 * @returns Estimated token count, `0` for an empty string.
 */
export function estimateTokenCount(text: string): number {
  // Approximation: 1 token ≈ 3.5 characters for Hungarian text.
  const charsPerToken = 3.5;
  return Math.ceil(text.length / charsPerToken);
}
|
|
22
|
+
|
|
23
|
+
/**
 * Decide whether an input is large enough that it must be split into chunks
 * before being sent to the model or backend.
 *
 * - On-device: the estimated token count is compared against 80% of the
 *   mode's token quota, leaving the remaining 20% as a buffer for output.
 * - Backend: only `"translate"` is limited (AWS Translate, 10,000 characters);
 *   inputs longer than 9,000 characters are chunked. Other modes are never
 *   chunked.
 *
 * @param text - The full input text.
 * @param mode - The AI feature that will process the text.
 * @param isOnDevice - `true` when an on-device model handles the request,
 *   `false` when it is sent to the backend.
 * @returns `true` if the text should be chunked.
 */
export function shouldChunkInput(
  text: string,
  mode: "summarize" | "translate" | "rewrite" | "proofread",
  isOnDevice: boolean,
): boolean {
  if (!isOnDevice) {
    // Backend: only AWS Translate has a size limit (10,000); chunk above
    // 9,000 characters to stay safely below it.
    // NOTE(review): AWS documents this limit in UTF-8 bytes, while this check
    // counts UTF-16 code units — confirm that accented text cannot exceed it.
    return mode === "translate" && text.length > 9000;
  }

  // On-device models have per-mode token quotas.
  const quotas = {
    summarize: 8000,
    translate: 8000,
    rewrite: 8000,
    proofread: 10000, // Proofreader has a higher quota
  };

  // The fallback only matters for untyped callers passing an unknown mode.
  const quota = quotas[mode] ?? 8000;

  // Use an 80% threshold for safety (buffer for output).
  return estimateTokenCount(text) > quota * 0.8;
}
|
|
57
|
+
|
|
58
|
+
/** Punctuation marks that end a sentence. */
const SENTENCE_ENDERS: ReadonlySet<string> = new Set([".", "!", "?"]);

/** Punctuation marks that end a clause. */
const CLAUSE_ENDERS: ReadonlySet<string> = new Set([",", ";", ":"]);

/**
 * Scan backwards through `text[start, end)` for the last punctuation mark
 * from `enders` that is followed by a space, `\n` or `\r`.
 *
 * Only cut positions in the latter half of the window are accepted, so a
 * boundary never produces a chunk shorter than half the window.
 *
 * @param text - Full source text.
 * @param start - Inclusive start of the window.
 * @param end - Exclusive end of the window.
 * @param enders - Punctuation characters that mark a boundary.
 * @param acceptEndOfText - Also accept a mark that is the last character of `text`.
 * @returns The index just after the punctuation mark, or `-1` if none qualifies.
 */
function findLastBoundary(
  text: string,
  start: number,
  end: number,
  enders: ReadonlySet<string>,
  acceptEndOfText: boolean,
): number {
  // Cuts at or before the window midpoint are rejected, so scanning can stop
  // as soon as the candidate position reaches it.
  const minBoundary = start + (end - start) * 0.5;

  for (let i = end - 1; i >= start && i + 1 > minBoundary; i--) {
    if (!enders.has(text.charAt(i))) continue;

    const next = text.charAt(i + 1); // "" when i is the last character
    const followedByWhitespace = next === " " || next === "\n" || next === "\r";
    if (followedByWhitespace || (acceptEndOfText && i === text.length - 1)) {
      return i + 1;
    }
  }

  return -1;
}

/**
 * Find the last sentence boundary (`.`, `!` or `?` followed by whitespace or
 * the end of the text) in the latter half of `text[start, end)`.
 *
 * @returns The index just after the sentence ender, or `-1` if none qualifies.
 */
function findLastSentenceBoundary(
  text: string,
  start: number,
  end: number,
): number {
  return findLastBoundary(text, start, end, SENTENCE_ENDERS, true);
}

/**
 * Find the last clause boundary (`,`, `;` or `:` followed by whitespace) in
 * the latter half of `text[start, end)`.
 *
 * @returns The index just after the clause marker, or `-1` if none qualifies.
 */
function findLastClauseBoundary(
  text: string,
  start: number,
  end: number,
): number {
  return findLastBoundary(text, start, end, CLAUSE_ENDERS, false);
}

/**
 * Pick where to cut a chunk that starts at `start` and may extend to `hardEnd`.
 *
 * Candidates are tried in priority order (paragraph, sentence, clause, word)
 * and each is accepted only if it lies past the midpoint of the window.
 *
 * @returns The exclusive end index of the chunk; `hardEnd` if no boundary fits.
 */
function chooseChunkEnd(
  text: string,
  start: number,
  hardEnd: number,
  maxCharsPerChunk: number,
): number {
  const minBreak = start + maxCharsPerChunk * 0.5;

  // 1. Paragraph break. The two newlines are consumed by this chunk and then
  //    removed again when the chunk text is trimmed.
  const paragraphBreak = text.lastIndexOf("\n\n", hardEnd);
  if (paragraphBreak > minBreak) return paragraphBreak + 2;

  // 2. Sentence ending.
  const sentenceEnd = findLastSentenceBoundary(text, start, hardEnd);
  if (sentenceEnd > 0) return sentenceEnd;

  // 3. Clause marker.
  const clauseEnd = findLastClauseBoundary(text, start, hardEnd);
  if (clauseEnd > 0) return clauseEnd;

  // 4. Last resort: word boundary.
  const lastSpace = text.lastIndexOf(" ", hardEnd);
  if (lastSpace > minBreak) return lastSpace + 1;

  // No good boundary found: cut at the size limit.
  return hardEnd;
}

/**
 * Split text into chunks at intelligent boundaries.
 *
 * Priority order for splitting:
 * 1. Paragraph breaks (`\n\n`)
 * 2. Sentence endings (`. ! ?`)
 * 3. Clause markers (`, ; :`)
 * 4. Word boundaries (space)
 *
 * A boundary is used only if it falls in the latter half of the current
 * window; otherwise the text is cut at `maxCharsPerChunk`. Each chunk's
 * `text` is trimmed, and spans that are whitespace only produce no chunk.
 * `start`/`end` describe the untrimmed span, so consecutive chunks tile the
 * source text.
 *
 * @param text - Text to split.
 * @param maxCharsPerChunk - Maximum chunk length in characters; must be positive.
 * @returns The chunks in source order; empty if `text` has no non-whitespace content.
 * @throws RangeError if `maxCharsPerChunk` is `NaN`, zero or negative. A
 *   non-positive size would otherwise never advance through the text.
 */
export function splitTextIntoChunks(
  text: string,
  maxCharsPerChunk: number,
): TextChunk[] {
  if (Number.isNaN(maxCharsPerChunk) || maxCharsPerChunk <= 0) {
    throw new RangeError(
      `maxCharsPerChunk must be a positive number, got ${maxCharsPerChunk}`,
    );
  }

  const chunks: TextChunk[] = [];
  let currentPos = 0;

  while (currentPos < text.length) {
    const hardEnd = Math.min(currentPos + maxCharsPerChunk, text.length);

    // The final window takes everything that is left; earlier windows look
    // for a natural place to cut.
    const chunkEnd =
      hardEnd < text.length
        ? chooseChunkEnd(text, currentPos, hardEnd, maxCharsPerChunk)
        : hardEnd;

    const chunkText = text.substring(currentPos, chunkEnd).trim();
    if (chunkText.length > 0) {
      chunks.push({ text: chunkText, start: currentPos, end: chunkEnd });
    }

    currentPos = chunkEnd;
  }

  return chunks;
}
|
|
179
|
+
|
|
180
|
+
/**
 * Calculate the maximum chunk size, in characters, for a given mode and
 * execution target.
 *
 * - On-device: 80% of the mode's token quota, converted to characters at
 *   roughly 3.5 characters per token and rounded down.
 * - Backend `"translate"`: 9,000 characters (AWS Translate allows 10,000).
 * - Any other backend mode: 10,000 characters as a fallback; those modes are
 *   not expected to be chunked when `shouldChunkInput` gates the call.
 *
 * @param mode - The AI feature that will process the text.
 * @param isOnDevice - `true` when an on-device model handles the request,
 *   `false` when it is sent to the backend.
 * @returns Maximum number of characters per chunk.
 */
export function getChunkSize(
  mode: "summarize" | "translate" | "rewrite" | "proofread",
  isOnDevice: boolean,
): number {
  if (!isOnDevice) {
    // Backend: only AWS Translate is limited (10,000); use 9,000 for safety.
    // NOTE(review): AWS documents this limit in UTF-8 bytes, while chunks are
    // measured in UTF-16 code units — confirm the margin covers accented text.
    // The 10,000 fallback should not be reached if shouldChunkInput is used
    // correctly.
    return mode === "translate" ? 9000 : 10000;
  }

  // On-device: chunk sizes are derived from per-mode token quotas.
  const quotas = {
    summarize: 8000,
    translate: 8000,
    rewrite: 8000,
    proofread: 10000,
  };

  // The fallback only matters for untyped callers passing an unknown mode.
  const quota = quotas[mode] ?? 8000;

  // Keep 20% of the quota free as a safety buffer.
  const safeQuota = quota * 0.8;

  // Convert tokens to characters (1 token ≈ 3.5 chars).
  return Math.floor(safeQuota * 3.5);
}
|
package/tsup.config.ts
CHANGED
|
@@ -2,7 +2,8 @@ import { defineConfig } from "tsup";
|
|
|
2
2
|
|
|
3
3
|
export default defineConfig({
|
|
4
4
|
// Copy non-hashed global CSS so consumers can import it (like Mantine styles)
|
|
5
|
-
onSuccess:
|
|
5
|
+
onSuccess:
|
|
6
|
+
"node -e \"const fs=require('fs'); const path=require('path'); fs.mkdirSync('dist',{recursive:true}); fs.copyFileSync(path.join('src','styles','ai-kit-ui.css'), path.join('dist','ai-kit-ui.css'));\"",
|
|
6
7
|
|
|
7
8
|
entry: ["src/index.tsx"],
|
|
8
9
|
format: ["cjs", "esm"],
|