discourse_ai-tokenizers 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/discourse_ai/tokenizer/open_ai_tokenizer.rb +27 -5
- data/lib/discourse_ai/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5691c266deeffc5e632d111fdbf6fa9b54797d1f1fc6f030d53418e9a7a50394
+  data.tar.gz: 23c28ddeed6956dd051741e153b044fd2fef28c882b3efb16de14633ceca64a0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: de190053755df5292b99c99fe5f758cbacd190b3c7da16379e702dd097a572ceee4b1da79e6d23e2a31e06effd2a75fd492e031139ddee6fe030b18c1267f01b
+  data.tar.gz: a3fa6e00c7e4e49244944e75b978a89f0b0ec44217d6168f1b87ea1862cb34416f571d554e5fed6995be4a96f614f72458ffc12387ddcfcbba0b62e4dfc7df4f
```
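These digests are what a consumer can check against the two archives packed inside the published .gem. A minimal verification sketch in Ruby, assuming a locally downloaded `discourse_ai-tokenizers-0.3.2.gem` (the path and the choice to check only the SHA256 values are illustrative, not part of this diff):

```ruby
require "digest"
require "rubygems/package"

# Illustrative local path to the downloaded gem.
gem_path = "discourse_ai-tokenizers-0.3.2.gem"

# Expected digests copied from the new checksums.yaml above.
expected_sha256 = {
  "metadata.gz" => "5691c266deeffc5e632d111fdbf6fa9b54797d1f1fc6f030d53418e9a7a50394",
  "data.tar.gz" => "23c28ddeed6956dd051741e153b044fd2fef28c882b3efb16de14633ceca64a0",
}

# A .gem file is a plain tar archive whose members include metadata.gz and
# data.tar.gz; hash each member and compare it with the recorded digest.
File.open(gem_path, "rb") do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    expected = expected_sha256[entry.full_name]
    next unless expected

    actual = Digest::SHA256.hexdigest(entry.read)
    puts "#{entry.full_name}: #{actual == expected ? "OK" : "MISMATCH"}"
  end
end
```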
data/CHANGELOG.md
CHANGED

data/lib/discourse_ai/tokenizer/open_ai_tokenizer.rb
CHANGED

```diff
@@ -19,9 +19,31 @@ module DiscourseAi
 
       def decode(token_ids)
         tokenizer.decode(token_ids)
-      rescue Tiktoken::UnicodeError
-
-
+      rescue Tiktoken::UnicodeError
+        token_ids = token_ids.dup
+
+        # this easy case, we started with a valid sequnce but truncated it on an invalid boundary
+        # work backwards removing tokens until we can decode again
+        tries = 4
+        while tries > 0
+          begin
+            token_ids.pop
+            return tokenizer.decode(token_ids)
+          rescue Tiktoken::UnicodeError
+            tries -= 1
+          end
+        end
+
+        # at this point we may have a corrupted sequence so just decode what we can
+        token_ids
+          .map do |id|
+            begin
+              tokenizer.decode([id])
+            rescue Tiktoken::UnicodeError
+              ""
+            end
+          end
+          .join
       end
 
       def truncate(text, max_length, strict: false)
```
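The new rescue branch targets the case where a token array was cut in the middle of a multi-byte character, which is exactly what `truncate` (next hunk) can produce. A short usage sketch, assuming the constant name implied by the file path and a class-level `tokenize`/`decode` API (neither is spelled out in this diff):

```ruby
require "discourse_ai/tokenizers" # assumed require path for this gem

# Constant inferred from lib/discourse_ai/tokenizer/open_ai_tokenizer.rb; illustrative only.
tok = DiscourseAi::Tokenizer::OpenAiTokenizer

ids = tok.tokenize("hello 🙂 world")

# Dropping the last token can leave an incomplete UTF-8 sequence. In 0.3.1 that
# let Tiktoken::UnicodeError escape; in 0.3.2 decode pops up to four trailing
# tokens and retries, then falls back to decoding token by token and joining
# whatever is valid, so it always returns a String.
puts tok.decode(ids[0...-1])
```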
```diff
@@ -33,12 +55,12 @@ module DiscourseAi
 
         # Take tokens up to max_length, decode, then ensure we don't exceed limit
         truncated_tokens = tokenize(text).take(max_length)
-        truncated_text =
+        truncated_text = decode(truncated_tokens)
 
         # If re-encoding exceeds the limit, we need to further truncate
         while tokenize(truncated_text).length > max_length
           truncated_tokens = truncated_tokens[0...-1]
-          truncated_text =
+          truncated_text = decode(truncated_tokens)
           break if truncated_tokens.empty?
         end
 
```
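In `truncate`, the only change is that both decode calls now go through the error-tolerant `decode` above rather than the raw Tiktoken decoder, so a cut that lands inside a multi-byte character no longer raises mid-truncation. A usage sketch under the same assumed class-level API (the `strict:` keyword's behaviour is outside this hunk):

```ruby
tok = DiscourseAi::Tokenizer::OpenAiTokenizer # assumed constant, as above

text = "naïve café 🙂 " * 50

# Per the hunk: take max_length tokens, decode, then re-tokenize and drop one
# trailing token at a time until the round-trip fits the budget again.
truncated = tok.truncate(text, 16)

puts tok.tokenize(truncated).length <= 16 # => true
```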
metadata
CHANGED

```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - Rafael Silva
@@ -145,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.6.
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Unified tokenizer interface for AI/ML models supporting OpenAI, Anthropic,
   Gemini, Llama, and embedding models
```