discourse_ai-tokenizers 0.4 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c475199d403a36cf33668b1f64e030d2166034656e1c48760dfe3a6f18c3a394
|
|
4
|
+
data.tar.gz: 48e9daf8943ba37ad8ea14cd81c275c29352459dfd226b95b605dde403dee748
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fcf2e4fa7d73348f55bfc13a6de3eded0391fdca58d6629fc672dc6856eade5adf3f196902d6ffc5b9ef01079783c651f74fe89aad41fdf26573e866a5110120
|
|
7
|
+
data.tar.gz: 98e9dcdffa14456a090ff2f3a06189f79585d8c97c79714710a17fd62491416fe63f0c6204fd4be76c5b3348d591ab4c978fade5ae5a45870de03787d014078d
|
data/CHANGELOG.md
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
## [
|
|
1
|
+
## [0.4.2] - 2026-02-27
|
|
2
|
+
|
|
3
|
+
- Normalize `ASCII-8BIT`/non-UTF-8 string inputs before tokenization to prevent `EncodingError` in `truncate`, `encode`, and `below_limit?`
|
|
4
|
+
|
|
5
|
+
## [0.4.1] - 2026-02-26
|
|
6
|
+
|
|
7
|
+
- Fix tiktoken-rs stack overflow crash by chunking large inputs at whitespace boundaries before encoding
|
|
2
8
|
|
|
3
9
|
## [0.4.0] - 2026-01-06
|
|
4
10
|
|
|
@@ -21,7 +21,7 @@ module DiscourseAi
|
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
def tokenize(text)
|
|
24
|
-
tokenizer.encode(text).tokens
|
|
24
|
+
tokenizer.encode(normalize_text(text)).tokens
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
def size(text)
|
|
@@ -32,38 +32,67 @@ module DiscourseAi
|
|
|
32
32
|
tokenizer.decode(token_ids)
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
-
def encode(
|
|
36
|
-
tokenizer.encode(
|
|
35
|
+
def encode(text)
|
|
36
|
+
tokenizer.encode(normalize_text(text)).ids
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def truncate(text, max_length, strict: false)
|
|
40
40
|
return "" if max_length <= 0
|
|
41
41
|
|
|
42
|
+
text = normalize_text(text)
|
|
43
|
+
|
|
42
44
|
# fast track common case, /2 to handle unicode chars
|
|
43
45
|
# that can take more than 1 token per char
|
|
44
46
|
return text if !strict && text.size < max_length / 2
|
|
45
47
|
|
|
46
48
|
# Take tokens up to max_length, decode, then ensure we don't exceed limit
|
|
47
49
|
truncated_tokens = tokenizer.encode(text).ids.take(max_length)
|
|
48
|
-
truncated_text = tokenizer.decode(truncated_tokens)
|
|
50
|
+
truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
|
|
49
51
|
|
|
50
52
|
# If re-encoding exceeds the limit, we need to further truncate
|
|
51
53
|
while tokenizer.encode(truncated_text).ids.length > max_length
|
|
52
54
|
truncated_tokens = truncated_tokens[0...-1]
|
|
53
|
-
truncated_text = tokenizer.decode(truncated_tokens)
|
|
55
|
+
truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
|
|
54
56
|
break if truncated_tokens.empty?
|
|
55
57
|
end
|
|
56
58
|
|
|
57
|
-
truncated_text
|
|
59
|
+
normalize_text(truncated_text)
|
|
58
60
|
end
|
|
59
61
|
|
|
60
62
|
def below_limit?(text, limit, strict: false)
|
|
63
|
+
text = normalize_text(text)
|
|
64
|
+
|
|
61
65
|
# fast track common case, /2 to handle unicode chars
|
|
62
66
|
# that can take more than 1 token per char
|
|
63
67
|
return true if !strict && text.size < limit / 2
|
|
64
68
|
|
|
65
69
|
tokenizer.encode(text).ids.length < limit
|
|
66
70
|
end
|
|
71
|
+
|
|
72
|
+
private
|
|
73
|
+
|
|
74
|
+
def normalize_text(text)
|
|
75
|
+
return text unless text.is_a?(String)
|
|
76
|
+
|
|
77
|
+
# Fast path: avoid allocations for the common valid UTF-8 case.
|
|
78
|
+
if text.encoding == Encoding::UTF_8 && text.valid_encoding?
|
|
79
|
+
return text
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
if text.encoding == Encoding::ASCII_8BIT
|
|
83
|
+
normalized = text.dup
|
|
84
|
+
normalized.force_encoding(Encoding::UTF_8)
|
|
85
|
+
elsif text.encoding != Encoding::UTF_8
|
|
86
|
+
normalized = text.encode(Encoding::UTF_8)
|
|
87
|
+
else
|
|
88
|
+
normalized = text
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
normalized.valid_encoding? ? normalized : normalized.scrub
|
|
92
|
+
rescue Encoding::UndefinedConversionError,
|
|
93
|
+
Encoding::InvalidByteSequenceError
|
|
94
|
+
text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
|
|
95
|
+
end
|
|
67
96
|
end
|
|
68
97
|
end
|
|
69
98
|
end
|
|
@@ -4,17 +4,22 @@ module DiscourseAi
|
|
|
4
4
|
module Tokenizer
|
|
5
5
|
# Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
|
|
6
6
|
class OpenAiTokenizer < BasicTokenizer
|
|
7
|
+
# tiktoken-rs uses fancy-regex which can stack overflow on large inputs
|
|
8
|
+
# due to catastrophic backtracking (github.com/openai/tiktoken/issues/245).
|
|
9
|
+
# Chunking at whitespace boundaries prevents this while preserving accuracy.
|
|
10
|
+
SAFE_CHUNK_SIZE = 50_000
|
|
11
|
+
|
|
7
12
|
class << self
|
|
8
13
|
def tokenizer
|
|
9
14
|
@tokenizer ||= Tiktoken.get_encoding("o200k_base")
|
|
10
15
|
end
|
|
11
16
|
|
|
12
17
|
def tokenize(text)
|
|
13
|
-
|
|
18
|
+
safe_encode(text)
|
|
14
19
|
end
|
|
15
20
|
|
|
16
21
|
def encode(text)
|
|
17
|
-
|
|
22
|
+
safe_encode(text)
|
|
18
23
|
end
|
|
19
24
|
|
|
20
25
|
def decode(token_ids)
|
|
@@ -49,30 +54,63 @@ module DiscourseAi
|
|
|
49
54
|
def truncate(text, max_length, strict: false)
|
|
50
55
|
return "" if max_length <= 0
|
|
51
56
|
|
|
57
|
+
text = normalize_text(text)
|
|
58
|
+
|
|
52
59
|
# fast track common case, /2 to handle unicode chars
|
|
53
60
|
# that can take more than 1 token per char
|
|
54
61
|
return text if !strict && text.size < max_length / 2
|
|
55
62
|
|
|
56
63
|
# Take tokens up to max_length, decode, then ensure we don't exceed limit
|
|
57
64
|
truncated_tokens = tokenize(text).take(max_length)
|
|
58
|
-
truncated_text = decode(truncated_tokens)
|
|
65
|
+
truncated_text = normalize_text(decode(truncated_tokens))
|
|
59
66
|
|
|
60
67
|
# If re-encoding exceeds the limit, we need to further truncate
|
|
61
68
|
while tokenize(truncated_text).length > max_length
|
|
62
69
|
truncated_tokens = truncated_tokens[0...-1]
|
|
63
|
-
truncated_text = decode(truncated_tokens)
|
|
70
|
+
truncated_text = normalize_text(decode(truncated_tokens))
|
|
64
71
|
break if truncated_tokens.empty?
|
|
65
72
|
end
|
|
66
73
|
|
|
67
|
-
truncated_text
|
|
74
|
+
normalize_text(truncated_text)
|
|
68
75
|
end
|
|
69
76
|
|
|
70
77
|
def below_limit?(text, limit, strict: false)
|
|
78
|
+
text = normalize_text(text)
|
|
79
|
+
|
|
71
80
|
# fast track common case, /2 to handle unicode chars
|
|
72
81
|
# that can take more than 1 token per char
|
|
73
82
|
return true if !strict && text.size < limit / 2
|
|
74
83
|
|
|
75
|
-
|
|
84
|
+
safe_encode(text).length < limit
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
private
|
|
88
|
+
|
|
89
|
+
def safe_encode(text)
|
|
90
|
+
text = normalize_text(text)
|
|
91
|
+
|
|
92
|
+
if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
|
|
93
|
+
return tokenizer.encode(text)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
tokens = []
|
|
97
|
+
offset = 0
|
|
98
|
+
while offset < text.size
|
|
99
|
+
chunk_end = offset + SAFE_CHUNK_SIZE
|
|
100
|
+
|
|
101
|
+
if chunk_end < text.size
|
|
102
|
+
# Split at a whitespace boundary to preserve tokenization accuracy
|
|
103
|
+
break_point = text.rindex(/\s/, chunk_end)
|
|
104
|
+
chunk_end = break_point if break_point && break_point > offset
|
|
105
|
+
else
|
|
106
|
+
chunk_end = text.size
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
tokens.concat(tokenizer.encode(text[offset...chunk_end]))
|
|
110
|
+
offset = chunk_end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
tokens
|
|
76
114
|
end
|
|
77
115
|
end
|
|
78
116
|
end
|