discourse_ai-tokenizers 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 36b5e98f002fe493df0c192a5ba86cf1a65d7c5d58207a3ee51a151c71d25002
- data.tar.gz: c20fdaa5692731610370d9c8bf790a12ace12a5b3513d95f238e64369396dfcf
+ metadata.gz: 5691c266deeffc5e632d111fdbf6fa9b54797d1f1fc6f030d53418e9a7a50394
+ data.tar.gz: 23c28ddeed6956dd051741e153b044fd2fef28c882b3efb16de14633ceca64a0
  SHA512:
- metadata.gz: f83d3e648f680f40c099add8596111d25e74cadb21109bcc7eb1914b12c19b42b118435cd0c99e2781002ea5090325a59b69ded692a05fc8ea98c86a6f13bd5e
- data.tar.gz: 0bc123e4127d01bb85650147b4c56134c2789b12ec3edffa0377512482e731e1718b0811f7df18b9b8978e0f4556a10de030de1d4548496e11bc180979d9cf4b
+ metadata.gz: de190053755df5292b99c99fe5f758cbacd190b3c7da16379e702dd097a572ceee4b1da79e6d23e2a31e06effd2a75fd492e031139ddee6fe030b18c1267f01b
+ data.tar.gz: a3fa6e00c7e4e49244944e75b978a89f0b0ec44217d6168f1b87ea1862cb34416f571d554e5fed6995be4a96f614f72458ffc12387ddcfcbba0b62e4dfc7df4f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
  ## [Unreleased]
 
+ ## [0.3.2] - 2025-12-10
+
+ - Fix truncation logic in OpenAiTokenizer that could lead to string parsing failures
+
  ## [0.3.1] - 2025-07-07
 
  - Refactor OpenAiO200kTokenizer class to OpenAiTokenizer as primary class name
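
For context on the 0.3.2 entry: OpenAI's BPE vocabularies are byte level, so a single multi-byte character can span several token ids, and taking a prefix of those ids can end in the middle of a character. The sketch below (not part of the gem) illustrates the failure mode against tiktoken_ruby, which backs this tokenizer; the encoding name, sample string, and cut point are illustrative assumptions, not taken from the gem's code.

```ruby
require "tiktoken_ruby"

# Assumed encoding name (o200k, per the tokenizer's name); sample text is illustrative.
enc = Tiktoken.get_encoding(:o200k_base)

tokens = enc.encode("日本語のテキストと絵文字 🙂")
head = tokens.take(tokens.length - 1) # a cut that may land inside a multi-byte character

begin
  puts enc.decode(head)
rescue Tiktoken::UnicodeError
  # This is the error the 0.3.1 truncate path surfaced, because it called
  # tokenizer.decode directly instead of the rescuing decode wrapper added below.
  puts "cut landed mid-character"
end
```

Whether a given cut actually lands mid-character depends on the vocabulary, which is why the change below retries the decode rather than assuming either outcome.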
@@ -19,9 +19,31 @@ module DiscourseAi
 
  def decode(token_ids)
  tokenizer.decode(token_ids)
- rescue Tiktoken::UnicodeError => e
- # Handle invalid token IDs gracefully by returning empty string
- ""
+ rescue Tiktoken::UnicodeError
+ token_ids = token_ids.dup
+
+ # easy case: we started with a valid sequence but truncated it on an invalid boundary,
+ # so work backwards removing tokens until we can decode again
+ tries = 4
+ while tries > 0
+ begin
+ token_ids.pop
+ return tokenizer.decode(token_ids)
+ rescue Tiktoken::UnicodeError
+ tries -= 1
+ end
+ end
+
+ # at this point we may have a corrupted sequence, so just decode what we can
+ token_ids
+ .map do |id|
+ begin
+ tokenizer.decode([id])
+ rescue Tiktoken::UnicodeError
+ ""
+ end
+ end
+ .join
  end
 
  def truncate(text, max_length, strict: false)
@@ -33,12 +55,12 @@ module DiscourseAi
 
  # Take tokens up to max_length, decode, then ensure we don't exceed limit
  truncated_tokens = tokenize(text).take(max_length)
- truncated_text = tokenizer.decode(truncated_tokens)
+ truncated_text = decode(truncated_tokens)
 
  # If re-encoding exceeds the limit, we need to further truncate
  while tokenize(truncated_text).length > max_length
  truncated_tokens = truncated_tokens[0...-1]
- truncated_text = tokenizer.decode(truncated_tokens)
+ truncated_text = decode(truncated_tokens)
  break if truncated_tokens.empty?
  end
 
@@ -2,6 +2,6 @@
 
  module DiscourseAi
  module Tokenizers
- VERSION = "0.3.1"
+ VERSION = "0.3.2"
  end
  end
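
The net effect of the tokenizer change and the version bump above, as a hedged usage sketch: it assumes the decode/truncate/tokenize methods shown in the diff are reachable as class-level methods on DiscourseAi::Tokenizers::OpenAiTokenizer and that the gem loads via the conventional require path; neither detail is confirmed by this diff, and the sample text is illustrative.

```ruby
# Hypothetical require path following the usual gem-name convention.
require "discourse_ai/tokenizers"

tok = DiscourseAi::Tokenizers::OpenAiTokenizer

text = "絵文字と日本語のテキスト 🙂🙃 " * 10 # illustrative multi-byte input

# Under 0.3.1, truncate decoded with tokenizer.decode directly, so a cut that
# landed inside a multi-byte character could raise Tiktoken::UnicodeError.
# Under 0.3.2, truncate calls the rescuing decode, which pops trailing tokens
# (up to four) until the prefix decodes, then falls back to per-token decoding.
short = tok.truncate(text, 10)

puts short
puts tok.tokenize(short).length <= 10 # the re-encoded result stays within the limit
```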
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: discourse_ai-tokenizers
  version: !ruby/object:Gem::Version
- version: 0.3.1
+ version: 0.3.2
  platform: ruby
  authors:
  - Rafael Silva
@@ -145,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.6.7
+ rubygems_version: 3.6.9
  specification_version: 4
  summary: Unified tokenizer interface for AI/ML models supporting OpenAI, Anthropic,
  Gemini, Llama, and embedding models