discourse_ai-tokenizers 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 476e96609f0c9bd5ccbb94f5dd464df65cb125d047941689834d8ff994094d80
4
- data.tar.gz: 1c987d4572105aab891e91ba86229a409d4932148521c32e180eaccbbe4fbc35
3
+ metadata.gz: c475199d403a36cf33668b1f64e030d2166034656e1c48760dfe3a6f18c3a394
4
+ data.tar.gz: 48e9daf8943ba37ad8ea14cd81c275c29352459dfd226b95b605dde403dee748
5
5
  SHA512:
6
- metadata.gz: 4cb19dbf675d42b4360f5beecb384a92835ffcd891f3010fcca589c1c90ba188253939aa3bb3b7b0506c72422554e69968a00a2389794febf77ab29b1b188546
7
- data.tar.gz: 4bd2db8132f668ed0a8f5a71060ac5a5d254cb4020a55d2fcd629dcc0775a29616e443af71c04d007ec629ee7dfa062eeddf7f0e86a26abd3e67c5b24cac7c09
6
+ metadata.gz: fcf2e4fa7d73348f55bfc13a6de3eded0391fdca58d6629fc672dc6856eade5adf3f196902d6ffc5b9ef01079783c651f74fe89aad41fdf26573e866a5110120
7
+ data.tar.gz: 98e9dcdffa14456a090ff2f3a06189f79585d8c97c79714710a17fd62491416fe63f0c6204fd4be76c5b3348d591ab4c978fade5ae5a45870de03787d014078d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,6 @@
1
- ## [Unreleased]
1
+ ## [0.4.2] - 2026-02-27
2
+
3
+ - Normalize `ASCII-8BIT`/non-UTF-8 string inputs before tokenization to prevent `EncodingError` in `truncate`, `encode`, and `below_limit?`
2
4
 
3
5
  ## [0.4.1] - 2026-02-26
4
6
 
@@ -21,7 +21,7 @@ module DiscourseAi
21
21
  end
22
22
 
23
23
  def tokenize(text)
24
- tokenizer.encode(text).tokens
24
+ tokenizer.encode(normalize_text(text)).tokens
25
25
  end
26
26
 
27
27
  def size(text)
@@ -32,38 +32,67 @@ module DiscourseAi
32
32
  tokenizer.decode(token_ids)
33
33
  end
34
34
 
35
- def encode(tokens)
36
- tokenizer.encode(tokens).ids
35
+ def encode(text)
36
+ tokenizer.encode(normalize_text(text)).ids
37
37
  end
38
38
 
39
39
  def truncate(text, max_length, strict: false)
40
40
  return "" if max_length <= 0
41
41
 
42
+ text = normalize_text(text)
43
+
42
44
  # fast track common case, /2 to handle unicode chars
43
45
  # that can take more than 1 token per char
44
46
  return text if !strict && text.size < max_length / 2
45
47
 
46
48
  # Take tokens up to max_length, decode, then ensure we don't exceed limit
47
49
  truncated_tokens = tokenizer.encode(text).ids.take(max_length)
48
- truncated_text = tokenizer.decode(truncated_tokens)
50
+ truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
49
51
 
50
52
  # If re-encoding exceeds the limit, we need to further truncate
51
53
  while tokenizer.encode(truncated_text).ids.length > max_length
52
54
  truncated_tokens = truncated_tokens[0...-1]
53
- truncated_text = tokenizer.decode(truncated_tokens)
55
+ truncated_text = normalize_text(tokenizer.decode(truncated_tokens))
54
56
  break if truncated_tokens.empty?
55
57
  end
56
58
 
57
- truncated_text
59
+ normalize_text(truncated_text)
58
60
  end
59
61
 
60
62
  def below_limit?(text, limit, strict: false)
63
+ text = normalize_text(text)
64
+
61
65
  # fast track common case, /2 to handle unicode chars
62
66
  # that can take more than 1 token per char
63
67
  return true if !strict && text.size < limit / 2
64
68
 
65
69
  tokenizer.encode(text).ids.length < limit
66
70
  end
71
+
72
+ private
73
+
74
+ def normalize_text(text)
75
+ return text unless text.is_a?(String)
76
+
77
+ # Fast path: avoid allocations for the common valid UTF-8 case.
78
+ if text.encoding == Encoding::UTF_8 && text.valid_encoding?
79
+ return text
80
+ end
81
+
82
+ if text.encoding == Encoding::ASCII_8BIT
83
+ normalized = text.dup
84
+ normalized.force_encoding(Encoding::UTF_8)
85
+ elsif text.encoding != Encoding::UTF_8
86
+ normalized = text.encode(Encoding::UTF_8)
87
+ else
88
+ normalized = text
89
+ end
90
+
91
+ normalized.valid_encoding? ? normalized : normalized.scrub
92
+ rescue Encoding::UndefinedConversionError,
93
+ Encoding::InvalidByteSequenceError
94
+ text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
95
+ end
67
96
  end
68
97
  end
69
98
  end
@@ -54,25 +54,29 @@ module DiscourseAi
54
54
  def truncate(text, max_length, strict: false)
55
55
  return "" if max_length <= 0
56
56
 
57
+ text = normalize_text(text)
58
+
57
59
  # fast track common case, /2 to handle unicode chars
58
60
  # that can take more than 1 token per char
59
61
  return text if !strict && text.size < max_length / 2
60
62
 
61
63
  # Take tokens up to max_length, decode, then ensure we don't exceed limit
62
64
  truncated_tokens = tokenize(text).take(max_length)
63
- truncated_text = decode(truncated_tokens)
65
+ truncated_text = normalize_text(decode(truncated_tokens))
64
66
 
65
67
  # If re-encoding exceeds the limit, we need to further truncate
66
68
  while tokenize(truncated_text).length > max_length
67
69
  truncated_tokens = truncated_tokens[0...-1]
68
- truncated_text = decode(truncated_tokens)
70
+ truncated_text = normalize_text(decode(truncated_tokens))
69
71
  break if truncated_tokens.empty?
70
72
  end
71
73
 
72
- truncated_text
74
+ normalize_text(truncated_text)
73
75
  end
74
76
 
75
77
  def below_limit?(text, limit, strict: false)
78
+ text = normalize_text(text)
79
+
76
80
  # fast track common case, /2 to handle unicode chars
77
81
  # that can take more than 1 token per char
78
82
  return true if !strict && text.size < limit / 2
@@ -83,6 +87,8 @@ module DiscourseAi
83
87
  private
84
88
 
85
89
  def safe_encode(text)
90
+ text = normalize_text(text)
91
+
86
92
  if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
87
93
  return tokenizer.encode(text)
88
94
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module DiscourseAi
4
4
  module Tokenizers
5
- VERSION = "0.4.1"
5
+ VERSION = "0.4.2"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discourse_ai-tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rafael Silva