discourse_ai-tokenizers 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/lib/discourse_ai/tokenizer/open_ai_tokenizer.rb +35 -3
- data/lib/discourse_ai/tokenizers/version.rb +1 -1
- metadata +9 -9
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 476e96609f0c9bd5ccbb94f5dd464df65cb125d047941689834d8ff994094d80
|
|
4
|
+
data.tar.gz: 1c987d4572105aab891e91ba86229a409d4932148521c32e180eaccbbe4fbc35
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4cb19dbf675d42b4360f5beecb384a92835ffcd891f3010fcca589c1c90ba188253939aa3bb3b7b0506c72422554e69968a00a2389794febf77ab29b1b188546
|
|
7
|
+
data.tar.gz: 4bd2db8132f668ed0a8f5a71060ac5a5d254cb4020a55d2fcd629dcc0775a29616e443af71c04d007ec629ee7dfa062eeddf7f0e86a26abd3e67c5b24cac7c09
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.4.1] - 2026-02-26
|
|
4
|
+
|
|
5
|
+
- Fix tiktoken-rs stack overflow crash by chunking large inputs at whitespace boundaries before encoding
|
|
6
|
+
|
|
7
|
+
## [0.4.0] - 2026-01-06
|
|
8
|
+
|
|
9
|
+
- Add Ruby 4.0 compatibility
|
|
10
|
+
|
|
3
11
|
## [0.3.2] - 2025-12-10
|
|
4
12
|
|
|
5
13
|
- Fix truncation logic in OpenAiTokenizer that could lead to string parsing failures
|
|
@@ -4,17 +4,22 @@ module DiscourseAi
|
|
|
4
4
|
module Tokenizer
|
|
5
5
|
# Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
|
|
6
6
|
class OpenAiTokenizer < BasicTokenizer
|
|
7
|
+
# tiktoken-rs uses fancy-regex which can stack overflow on large inputs
|
|
8
|
+
# due to catastrophic backtracking (github.com/openai/tiktoken/issues/245).
|
|
9
|
+
# Chunking at whitespace boundaries prevents this while preserving accuracy; a chunk with no usable whitespace break is split at the size limit instead.
|
|
10
|
+
SAFE_CHUNK_SIZE = 50_000
|
|
11
|
+
|
|
7
12
|
class << self
|
|
8
13
|
def tokenizer
|
|
9
14
|
@tokenizer ||= Tiktoken.get_encoding("o200k_base")
|
|
10
15
|
end
|
|
11
16
|
|
|
12
17
|
def tokenize(text)
|
|
13
|
-
|
|
18
|
+
safe_encode(text)
|
|
14
19
|
end
|
|
15
20
|
|
|
16
21
|
def encode(text)
|
|
17
|
-
|
|
22
|
+
safe_encode(text)
|
|
18
23
|
end
|
|
19
24
|
|
|
20
25
|
def decode(token_ids)
|
|
@@ -72,7 +77,34 @@ module DiscourseAi
|
|
|
72
77
|
# that can take more than 1 token per char
|
|
73
78
|
return true if !strict && text.size < limit / 2
|
|
74
79
|
|
|
75
|
-
|
|
80
|
+
safe_encode(text).length < limit
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
private
|
|
84
|
+
|
|
85
|
+
def safe_encode(text)
|
|
86
|
+
if !text.is_a?(String) || text.size <= SAFE_CHUNK_SIZE
|
|
87
|
+
return tokenizer.encode(text)
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
tokens = []
|
|
91
|
+
offset = 0
|
|
92
|
+
while offset < text.size
|
|
93
|
+
chunk_end = offset + SAFE_CHUNK_SIZE
|
|
94
|
+
|
|
95
|
+
if chunk_end < text.size
|
|
96
|
+
# Split at a whitespace boundary to preserve tokenization accuracy
|
|
97
|
+
break_point = text.rindex(/\s/, chunk_end)
|
|
98
|
+
chunk_end = break_point if break_point && break_point > offset
|
|
99
|
+
else
|
|
100
|
+
chunk_end = text.size
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
tokens.concat(tokenizer.encode(text[offset...chunk_end]))
|
|
104
|
+
offset = chunk_end
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
tokens
|
|
76
108
|
end
|
|
77
109
|
end
|
|
78
110
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: discourse_ai-tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.2
|
|
4
|
+
version: 0.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Rafael Silva
|
|
@@ -29,42 +29,42 @@ dependencies:
|
|
|
29
29
|
requirements:
|
|
30
30
|
- - "~>"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
|
-
version: 0.0.
|
|
32
|
+
version: 0.0.15
|
|
33
33
|
type: :runtime
|
|
34
34
|
prerelease: false
|
|
35
35
|
version_requirements: !ruby/object:Gem::Requirement
|
|
36
36
|
requirements:
|
|
37
37
|
- - "~>"
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
|
-
version: 0.0.
|
|
39
|
+
version: 0.0.15
|
|
40
40
|
- !ruby/object:Gem::Dependency
|
|
41
41
|
name: tokenizers
|
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
|
43
43
|
requirements:
|
|
44
44
|
- - "~>"
|
|
45
45
|
- !ruby/object:Gem::Version
|
|
46
|
-
version: 0.
|
|
46
|
+
version: 0.6.3
|
|
47
47
|
type: :runtime
|
|
48
48
|
prerelease: false
|
|
49
49
|
version_requirements: !ruby/object:Gem::Requirement
|
|
50
50
|
requirements:
|
|
51
51
|
- - "~>"
|
|
52
52
|
- !ruby/object:Gem::Version
|
|
53
|
-
version: 0.
|
|
53
|
+
version: 0.6.3
|
|
54
54
|
- !ruby/object:Gem::Dependency
|
|
55
55
|
name: rubocop-discourse
|
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
|
57
57
|
requirements:
|
|
58
|
-
- -
|
|
58
|
+
- - "~>"
|
|
59
59
|
- !ruby/object:Gem::Version
|
|
60
|
-
version: 3.8
|
|
60
|
+
version: '3.8'
|
|
61
61
|
type: :development
|
|
62
62
|
prerelease: false
|
|
63
63
|
version_requirements: !ruby/object:Gem::Requirement
|
|
64
64
|
requirements:
|
|
65
|
-
- -
|
|
65
|
+
- - "~>"
|
|
66
66
|
- !ruby/object:Gem::Version
|
|
67
|
-
version: 3.8
|
|
67
|
+
version: '3.8'
|
|
68
68
|
- !ruby/object:Gem::Dependency
|
|
69
69
|
name: syntax_tree
|
|
70
70
|
requirement: !ruby/object:Gem::Requirement
|