discourse_ai-tokenizers 0.4 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b3868dc8e228ff41a7319ceb5c97b1443646bc715654ff6872f6a107680d1240
4
- data.tar.gz: 1cd60cd362995d4b3a7be7495768e9ee0885a532eafaed9b482a7a1dab36e391
3
+ metadata.gz: c475199d403a36cf33668b1f64e030d2166034656e1c48760dfe3a6f18c3a394
4
+ data.tar.gz: 48e9daf8943ba37ad8ea14cd81c275c29352459dfd226b95b605dde403dee748
5
5
  SHA512:
6
- metadata.gz: 632ada3172909b8e230e20742da68d98ffd0f48ccfdb7d5b79d7055dd1579e8b9ec8d163c30221a63f6581a46ae8dcf376be97e3db9c6677a8a06b247eabfa1a
7
- data.tar.gz: 436f944b815606c911f98b465284860ebdd628a1a8b70deab0b37a3050d72dc18947caddbd4e47b0ad50ddfe997454f44454d91ac481a27b99cb9fe413c1d7be
6
+ metadata.gz: fcf2e4fa7d73348f55bfc13a6de3eded0391fdca58d6629fc672dc6856eade5adf3f196902d6ffc5b9ef01079783c651f74fe89aad41fdf26573e866a5110120
7
+ data.tar.gz: 98e9dcdffa14456a090ff2f3a06189f79585d8c97c79714710a17fd62491416fe63f0c6204fd4be76c5b3348d591ab4c978fade5ae5a45870de03787d014078d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,10 @@
1
- ## [Unreleased]
1
+ ## [0.4.2] - 2026-02-27
2
+
3
+ - Normalize `ASCII-8BIT`/non-UTF-8 string inputs before tokenization to prevent `EncodingError` in `truncate`, `encode`, and `below_limit?`
4
+
5
+ ## [0.4.1] - 2026-02-26
6
+
7
+ - Fix tiktoken-rs stack overflow crash by chunking large inputs at whitespace boundaries before encoding
2
8
 
3
9
  ## [0.4.0] - 2026-01-06
4
10
 
@@ -21,7 +21,7 @@ module DiscourseAi
21
21
  end
22
22
 
23
23
  def tokenize(text)
24
- tokenizer.encode(text).tokens
24
+ tokenizer.encode(normalize_text(text)).tokens
25
25
  end
26
26
 
27
27
  def size(text)
@@ -32,38 +32,67 @@ module DiscourseAi
32
32
  tokenizer.decode(token_ids)
33
33
  end
34
34
 
35
- def encode(tokens)
36
- tokenizer.encode(tokens).ids
35
+ def encode(text)
36
+ tokenizer.encode(normalize_text(text)).ids
37
37
  end
38
38
 
39
39
  def truncate(text, max_length, strict: false)
40
40
  return "" if max_length <= 0
41
41
 
42
+ text = normalize_text(text)
43
+
42
44
  # fast track common case, /2 to handle unicode chars
43
45
  # that can take more than 1 token per char
44
46
  return text if !strict && text.size < max_length / 2
45
47
 
46
48
  # Take tokens up to max_length, decode, then ensure we don't exceed limit
47
49
  truncated_tokens = tokenizer.encode(text).ids.take(max_length)
48
- truncated_text = tokenizer.decode(truncated_tokens)
50
+ truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
49
51
 
50
52
  # If re-encoding exceeds the limit, we need to further truncate
51
53
  while tokenizer.encode(truncated_text).ids.length > max_length
52
54
  truncated_tokens = truncated_tokens[0...-1]
53
- truncated_text = tokenizer.decode(truncated_tokens)
55
+ truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
54
56
  break if truncated_tokens.empty?
55
57
  end
56
58
 
57
- truncated_text
59
+ normalize_text(truncated_text)
58
60
  end
59
61
 
60
62
  def below_limit?(text, limit, strict: false)
63
+ text = normalize_text(text)
64
+
61
65
  # fast track common case, /2 to handle unicode chars
62
66
  # that can take more than 1 token per char
63
67
  return true if !strict && text.size < limit / 2
64
68
 
65
69
  tokenizer.encode(text).ids.length < limit
66
70
  end
71
+
72
+ private
73
+
74
+ def normalize_text(text)
75
+ return text unless text.is_a?(String)
76
+
77
+ # Fast path: avoid allocations for the common valid UTF-8 case.
78
+ if text.encoding == Encoding::UTF_8 && text.valid_encoding?
79
+ return text
80
+ end
81
+
82
+ if text.encoding == Encoding::ASCII_8BIT
83
+ normalized = text.dup
84
+ normalized.force_encoding(Encoding::UTF_8)
85
+ elsif text.encoding != Encoding::UTF_8
86
+ normalized = text.encode(Encoding::UTF_8)
87
+ else
88
+ normalized = text
89
+ end
90
+
91
+ normalized.valid_encoding? ? normalized : normalized.scrub
92
+ rescue Encoding::UndefinedConversionError,
93
+ Encoding::InvalidByteSequenceError
94
+ text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
95
+ end
67
96
  end
68
97
  end
69
98
  end
@@ -4,17 +4,22 @@ module DiscourseAi
4
4
  module Tokenizer
5
5
  # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
6
6
  class OpenAiTokenizer < BasicTokenizer
7
+ # tiktoken-rs uses fancy-regex which can stack overflow on large inputs
8
+ # due to catastrophic backtracking (github.com/openai/tiktoken/issues/245).
9
+ # Chunking at whitespace boundaries prevents this while preserving accuracy.
10
+ SAFE_CHUNK_SIZE = 50_000
11
+
7
12
  class << self
8
13
  def tokenizer
9
14
  @tokenizer ||= Tiktoken.get_encoding("o200k_base")
10
15
  end
11
16
 
12
17
  def tokenize(text)
13
- tokenizer.encode(text)
18
+ safe_encode(text)
14
19
  end
15
20
 
16
21
  def encode(text)
17
- tokenizer.encode(text)
22
+ safe_encode(text)
18
23
  end
19
24
 
20
25
  def decode(token_ids)
@@ -49,30 +54,63 @@ module DiscourseAi
49
54
  def truncate(text, max_length, strict: false)
50
55
  return "" if max_length <= 0
51
56
 
57
+ text = normalize_text(text)
58
+
52
59
  # fast track common case, /2 to handle unicode chars
53
60
  # than can take more than 1 token per char
54
61
  return text if !strict && text.size < max_length / 2
55
62
 
56
63
  # Take tokens up to max_length, decode, then ensure we don't exceed limit
57
64
  truncated_tokens = tokenize(text).take(max_length)
58
- truncated_text = decode(truncated_tokens)
65
+ truncated_text = normalize_text(decode(truncated_tokens))
59
66
 
60
67
  # If re-encoding exceeds the limit, we need to further truncate
61
68
  while tokenize(truncated_text).length > max_length
62
69
  truncated_tokens = truncated_tokens[0...-1]
63
- truncated_text = decode(truncated_tokens)
70
+ truncated_text = normalize_text(decode(truncated_tokens))
64
71
  break if truncated_tokens.empty?
65
72
  end
66
73
 
67
- truncated_text
74
+ normalize_text(truncated_text)
68
75
  end
69
76
 
70
77
  def below_limit?(text, limit, strict: false)
78
+ text = normalize_text(text)
79
+
71
80
  # fast track common case, /2 to handle unicode chars
72
81
  # that can take more than 1 token per char
73
82
  return true if !strict && text.size < limit / 2
74
83
 
75
- tokenizer.encode(text).length < limit
84
+ safe_encode(text).length < limit
85
+ end
86
+
87
+ private
88
+
89
+ def safe_encode(text)
90
+ text = normalize_text(text)
91
+
92
+ if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
93
+ return tokenizer.encode(text)
94
+ end
95
+
96
+ tokens = []
97
+ offset = 0
98
+ while offset < text.size
99
+ chunk_end = offset + SAFE_CHUNK_SIZE
100
+
101
+ if chunk_end < text.size
102
+ # Split at a whitespace boundary to preserve tokenization accuracy
103
+ break_point = text.rindex(/\s/, chunk_end)
104
+ chunk_end = break_point if break_point && break_point > offset
105
+ else
106
+ chunk_end = text.size
107
+ end
108
+
109
+ tokens.concat(tokenizer.encode(text[offset...chunk_end]))
110
+ offset = chunk_end
111
+ end
112
+
113
+ tokens
76
114
  end
77
115
  end
78
116
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module DiscourseAi
4
4
  module Tokenizers
5
- VERSION = "0.4"
5
+ VERSION = "0.4.2"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discourse_ai-tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.4'
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rafael Silva