discourse_ai-tokenizers 0.3.1 → 0.4

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 36b5e98f002fe493df0c192a5ba86cf1a65d7c5d58207a3ee51a151c71d25002
-  data.tar.gz: c20fdaa5692731610370d9c8bf790a12ace12a5b3513d95f238e64369396dfcf
+  metadata.gz: b3868dc8e228ff41a7319ceb5c97b1443646bc715654ff6872f6a107680d1240
+  data.tar.gz: 1cd60cd362995d4b3a7be7495768e9ee0885a532eafaed9b482a7a1dab36e391
 SHA512:
-  metadata.gz: f83d3e648f680f40c099add8596111d25e74cadb21109bcc7eb1914b12c19b42b118435cd0c99e2781002ea5090325a59b69ded692a05fc8ea98c86a6f13bd5e
-  data.tar.gz: 0bc123e4127d01bb85650147b4c56134c2789b12ec3edffa0377512482e731e1718b0811f7df18b9b8978e0f4556a10de030de1d4548496e11bc180979d9cf4b
+  metadata.gz: 632ada3172909b8e230e20742da68d98ffd0f48ccfdb7d5b79d7055dd1579e8b9ec8d163c30221a63f6581a46ae8dcf376be97e3db9c6677a8a06b247eabfa1a
+  data.tar.gz: 436f944b815606c911f98b465284860ebdd628a1a8b70deab0b37a3050d72dc18947caddbd4e47b0ad50ddfe997454f44454d91ac481a27b99cb9fe413c1d7be
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
 ## [Unreleased]
 
+## [0.4.0] - 2026-01-06
+
+- Add Ruby 4.0 compatibility
+
+## [0.3.2] - 2025-12-10
+
+- Fix truncation logic in OpenAiTokenizer that could cause string parsing failures
+
 ## [0.3.1] - 2025-07-07
 
 - Refactor OpenAiO200kTokenizer class to OpenAiTokenizer as primary class name
@@ -19,9 +19,31 @@ module DiscourseAi
 
       def decode(token_ids)
         tokenizer.decode(token_ids)
-      rescue Tiktoken::UnicodeError => e
-        # Handle invalid token IDs gracefully by returning empty string
-        ""
+      rescue Tiktoken::UnicodeError
+        token_ids = token_ids.dup
+
+        # The easy case: we started with a valid sequence but truncated it on an
+        # invalid boundary, so work backwards removing tokens until we can decode again.
+        tries = 4
+        while tries > 0
+          begin
+            token_ids.pop
+            return tokenizer.decode(token_ids)
+          rescue Tiktoken::UnicodeError
+            tries -= 1
+          end
+        end
+
+        # At this point we may have a corrupted sequence, so just decode what we can.
+        token_ids
+          .map do |id|
+            begin
+              tokenizer.decode([id])
+            rescue Tiktoken::UnicodeError
+              ""
+            end
+          end
+          .join
       end
 
       def truncate(text, max_length, strict: false)
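The new decode path no longer collapses the whole result to an empty string on an invalid token boundary: it first pops up to four trailing tokens looking for a decodable prefix, and only then falls back to decoding token by token, dropping the undecodable ones. Below is a minimal, self-contained sketch of that recovery strategy; FakeTokenizer, StubUnicodeError, and safe_decode are hypothetical stand-ins for illustration (the gem itself rescues Tiktoken::UnicodeError):

```ruby
# Stand-ins for Tiktoken and its error class, invented for this sketch.
class StubUnicodeError < StandardError; end

class FakeTokenizer
  # Any id missing from the table stands in for a token that lands on an
  # invalid byte boundary and cannot be decoded.
  TABLE = { 1 => "he", 2 => "llo" }.freeze

  def decode(ids)
    ids.map { |id| TABLE[id] || (raise StubUnicodeError) }.join
  end
end

def safe_decode(tokenizer, token_ids)
  tokenizer.decode(token_ids)
rescue StubUnicodeError
  token_ids = token_ids.dup
  4.times do # work backwards from the truncation point, as in the diff
    token_ids.pop
    return tokenizer.decode(token_ids)
  rescue StubUnicodeError
    next
  end
  # Corrupted sequence: decode each token alone and drop the bad ones.
  token_ids.map { |id| tokenizer.decode([id]) rescue "" }.join
end

tok = FakeTokenizer.new
safe_decode(tok, [1, 2])    # => "hello"
safe_decode(tok, [1, 2, 3]) # => "hello" (invalid trailing token dropped)
```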
@@ -33,12 +55,12 @@ module DiscourseAi
 
         # Take tokens up to max_length, decode, then ensure we don't exceed limit
         truncated_tokens = tokenize(text).take(max_length)
-        truncated_text = tokenizer.decode(truncated_tokens)
+        truncated_text = decode(truncated_tokens)
 
         # If re-encoding exceeds the limit, we need to further truncate
         while tokenize(truncated_text).length > max_length
           truncated_tokens = truncated_tokens[0...-1]
-          truncated_text = tokenizer.decode(truncated_tokens)
+          truncated_text = decode(truncated_tokens)
           break if truncated_tokens.empty?
         end
 
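With truncate now routed through the hardened decode, cutting a token stream in the middle of a multi-byte character degrades to a slightly shorter string instead of raising. A hypothetical usage sketch (the require path and class-method call style are assumed from gem conventions, not shown in this diff; truncate and tokenize are the methods from the hunks above):

```ruby
require "discourse_ai/tokenizers"

tok = DiscourseAi::Tokenizers::OpenAiTokenizer
text = "ñandú " * 10 # multi-byte characters can span several tokens

short = tok.truncate(text, 5)
tok.tokenize(short).length # => at most 5, even if the cut split a character
```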
@@ -2,6 +2,6 @@
 
 module DiscourseAi
   module Tokenizers
-    VERSION = "0.3.1"
+    VERSION = "0.4"
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: '0.4'
 platform: ruby
 authors:
 - Rafael Silva
@@ -29,42 +29,42 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.0.11.1
+        version: 0.0.15
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.0.11.1
+        version: 0.0.15
 - !ruby/object:Gem::Dependency
   name: tokenizers
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: 0.6.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: 0.6.3
 - !ruby/object:Gem::Dependency
   name: rubocop-discourse
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 3.8.1
+        version: '3.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 3.8.1
+        version: '3.8'
 - !ruby/object:Gem::Dependency
   name: syntax_tree
   requirement: !ruby/object:Gem::Requirement
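One functional note on the rubocop-discourse bump above: the constraint moves from an exact pin to a pessimistic one, so future 3.x releases from 3.8 onward are accepted without another gemspec change. Expressed in gemspec terms (the spec variable is illustrative):

```ruby
# "~> 3.8" allows >= 3.8 and < 4.0; the old '= 3.8.1' pinned one exact release.
spec.add_development_dependency "rubocop-discourse", "~> 3.8"
```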
@@ -145,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.6.7
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Unified tokenizer interface for AI/ML models supporting OpenAI, Anthropic,
   Gemini, Llama, and embedding models