discourse_ai-tokenizers 0.3.2 → 0.4.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5691c266deeffc5e632d111fdbf6fa9b54797d1f1fc6f030d53418e9a7a50394
-  data.tar.gz: 23c28ddeed6956dd051741e153b044fd2fef28c882b3efb16de14633ceca64a0
+  metadata.gz: 476e96609f0c9bd5ccbb94f5dd464df65cb125d047941689834d8ff994094d80
+  data.tar.gz: 1c987d4572105aab891e91ba86229a409d4932148521c32e180eaccbbe4fbc35
 SHA512:
-  metadata.gz: de190053755df5292b99c99fe5f758cbacd190b3c7da16379e702dd097a572ceee4b1da79e6d23e2a31e06effd2a75fd492e031139ddee6fe030b18c1267f01b
-  data.tar.gz: a3fa6e00c7e4e49244944e75b978a89f0b0ec44217d6168f1b87ea1862cb34416f571d554e5fed6995be4a96f614f72458ffc12387ddcfcbba0b62e4dfc7df4f
+  metadata.gz: 4cb19dbf675d42b4360f5beecb384a92835ffcd891f3010fcca589c1c90ba188253939aa3bb3b7b0506c72422554e69968a00a2389794febf77ab29b1b188546
+  data.tar.gz: 4bd2db8132f668ed0a8f5a71060ac5a5d254cb4020a55d2fcd629dcc0775a29616e443af71c04d007ec629ee7dfa062eeddf7f0e86a26abd3e67c5b24cac7c09
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
 ## [Unreleased]
 
+## [0.4.1] - 2026-02-26
+
+- Fix tiktoken-rs stack overflow crash by chunking large inputs at whitespace boundaries before encoding
+
+## [0.4.0] - 2026-01-06
+
+- Add Ruby 4.0 compatibility
+
 ## [0.3.2] - 2025-12-10
 
 - Fix truncation logic in OpenAiTokenizer could lead to string parsing fails
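The 0.4.1 entry is the substance of this release: tiktoken-rs pretokenizes with fancy-regex, which can blow the stack through catastrophic backtracking on very large inputs (github.com/openai/tiktoken/issues/245, cited in the source below). The next two hunks, from the OpenAiTokenizer source, implement the fix. A minimal sketch of the failure class, assuming the same tiktoken_ruby API the gem itself calls (the require name is an assumption):

require "tiktoken_ruby" # assumed require for the Tiktoken constant used below

enc = Tiktoken.get_encoding("o200k_base")

# Very long inputs, especially without whitespace, can drive fancy-regex
# into deep backtracking; before 0.4.1 a call like this could crash the
# whole process with a stack overflow rather than raise a Ruby exception.
enc.encode("a" * 500_000)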
@@ -4,17 +4,22 @@ module DiscourseAi
   module Tokenizer
     # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
     class OpenAiTokenizer < BasicTokenizer
+      # tiktoken-rs uses fancy-regex which can stack overflow on large inputs
+      # due to catastrophic backtracking (github.com/openai/tiktoken/issues/245).
+      # Chunking at whitespace boundaries prevents this while preserving accuracy.
+      SAFE_CHUNK_SIZE = 50_000
+
       class << self
         def tokenizer
           @tokenizer ||= Tiktoken.get_encoding("o200k_base")
         end
 
         def tokenize(text)
-          tokenizer.encode(text)
+          safe_encode(text)
         end
 
         def encode(text)
-          tokenizer.encode(text)
+          safe_encode(text)
         end
 
         def decode(token_ids)
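Callers are unaffected: tokenize and encode keep their signatures and now route through the private safe_encode shown in the next hunk. A usage sketch built only from the public methods visible in this diff (the require path is an assumption derived from the gem name):

require "discourse_ai/tokenizers" # assumed require path for the gem

ids = DiscourseAi::Tokenizer::OpenAiTokenizer.encode("hello world")
# ids is an array of o200k_base token ids
DiscourseAi::Tokenizer::OpenAiTokenizer.decode(ids) # => "hello world"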
@@ -72,7 +77,34 @@ module DiscourseAi
           # than can take more than 1 token per char
           return true if !strict && text.size < limit / 2
 
-          tokenizer.encode(text).length < limit
+          safe_encode(text).length < limit
+        end
+
+        private
+
+        def safe_encode(text)
+          if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
+            return tokenizer.encode(text)
+          end
+
+          tokens = []
+          offset = 0
+          while offset < text.size
+            chunk_end = offset + SAFE_CHUNK_SIZE
+
+            if chunk_end < text.size
+              # Split at a whitespace boundary to preserve tokenization accuracy
+              break_point = text.rindex(/\s/, chunk_end)
+              chunk_end = break_point if break_point && break_point > offset
+            else
+              chunk_end = text.size
+            end
+
+            tokens.concat(tokenizer.encode(text[offset...chunk_end]))
+            offset = chunk_end
+          end
+
+          tokens
         end
       end
     end
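The chunked path stays accurate because safe_encode splits immediately before a whitespace character (the whitespace lands at the start of the next chunk), and o200k_base's pretokenizer already attaches a leading space to the word that follows it, so encoding the pieces separately normally reproduces the token stream of the whole string. A sanity-check sketch, not part of the gem; equality holds for typical text but is not formally guaranteed for every input:

enc = Tiktoken.get_encoding("o200k_base")

whole  = enc.encode("alpha beta gamma")
pieces = enc.encode("alpha beta") + enc.encode(" gamma") # split before the space

whole == pieces # => true here; the boundary mirrors how safe_encode chunks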
@@ -2,6 +2,6 @@
 
 module DiscourseAi
   module Tokenizers
-    VERSION = "0.3.2"
+    VERSION = "0.4.1"
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.4.1
 platform: ruby
 authors:
 - Rafael Silva
@@ -29,42 +29,42 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.0.11.1
+        version: 0.0.15
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.0.11.1
+        version: 0.0.15
 - !ruby/object:Gem::Dependency
   name: tokenizers
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: 0.6.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: 0.6.3
 - !ruby/object:Gem::Dependency
   name: rubocop-discourse
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 3.8.1
+        version: '3.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 3.8.1
+        version: '3.8'
 - !ruby/object:Gem::Dependency
   name: syntax_tree
   requirement: !ruby/object:Gem::Requirement
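Besides the runtime bumps (the first dependency, unnamed in this hunk, goes 0.0.11.1 → 0.0.15; tokenizers goes 0.5.4 → 0.6.3), the rubocop-discourse development dependency is loosened from an exact pin to a pessimistic constraint. RubyGems' own classes show what each form admits; a quick illustration:

Gem::Requirement.new("~> 3.8").satisfied_by?(Gem::Version.new("3.9.2"))  # => true
Gem::Requirement.new("~> 3.8").satisfied_by?(Gem::Version.new("4.0.0"))  # => false
Gem::Requirement.new("= 3.8.1").satisfied_by?(Gem::Version.new("3.8.2")) # => false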