discourse_ai-tokenizers 0.2.0 → 0.3.0
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c2d3921cc11a89b45ff8e5f7a58c4ae4cb170791a45dc445d1f908a5cc83a88a
+  data.tar.gz: 7d5d4a725d97d608baea0c3946bde1156c5f7808ad5d6e5bbbb420670e195287
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 407774d2cfd411c88e4b43fb31aa572ea8a59bc285887f24dc96cf4843d7b8c1dc5b0c35b5731223a4e269772146ff8a9b499829bcace07b5b954f540d534bdf
+  data.tar.gz: da30167cc708d12dbba2763bdb31802c6ba165018d5ca7a1e698898ac5df41ee4b11c91aa24153588a238f98d1ebd886316e9716cf9bfef8c2ea65fbd9fec2a5
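These are RubyGems' standard per-archive digests: a .gem file is a tar archive containing metadata.gz and data.tar.gz, and checksums.yaml records their SHA256 and SHA512 values. A minimal verification sketch, assuming the 0.3.0 gem has already been unpacked with `tar xf discourse_ai-tokenizers-0.3.0.gem` so that data.tar.gz sits in the working directory:

# Minimal sketch: check the new data.tar.gz digest from this diff against
# the archive extracted from discourse_ai-tokenizers-0.3.0.gem.
require "digest"

expected = "7d5d4a725d97d608baea0c3946bde1156c5f7808ad5d6e5bbbb420670e195287"
actual = Digest::SHA256.file("data.tar.gz").hexdigest
puts(actual == expected ? "checksum OK" : "checksum MISMATCH")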
data/CHANGELOG.md CHANGED

@@ -1,5 +1,11 @@
 ## [Unreleased]
 
+## [0.3.0] - 2025-07-04
+
+- Add OpenAiCl100kTokenizer class for cl100k_base encoding
+- Refactor OpenAiTokenizer to OpenAiO200kTokenizer with backward compatibility alias
+- Update version to 0.3.0
+
 ## [0.2.0] - 2025-07-02
 
 - Initial release
data/lib/discourse_ai/tokenizer/open_ai_cl100k_tokenizer.rb ADDED

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
+    class OpenAiCl100kTokenizer < OpenAiTokenizer
+      class << self
+        def tokenizer
+          @tokenizer ||= Tiktoken.get_encoding("cl100k_base")
+        end
+      end
+    end
+  end
+end
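Because the new class subclasses OpenAiTokenizer (which, per the next diff, becomes an alias for OpenAiO200kTokenizer), it inherits the rest of the tokenizer API and only swaps the Tiktoken encoding. A usage sketch, assuming the BasicTokenizer base class exposes `tokenize` and `size` (those methods are not shown in this diff):

require "discourse_ai/tokenizers"

text = "Hello, world!"
tok = DiscourseAi::Tokenizer::OpenAiCl100kTokenizer

# `size` and `tokenize` are assumed inherited from BasicTokenizer;
# only the encoding (cl100k_base vs o200k_base) differs from the parent.
puts tok.size(text)
puts tok.tokenize(text).inspect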
data/lib/discourse_ai/tokenizer/open_ai_tokenizer.rb CHANGED

@@ -3,7 +3,7 @@
 module DiscourseAi
   module Tokenizer
     # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
-    class OpenAiTokenizer < BasicTokenizer
+    class OpenAiO200kTokenizer < BasicTokenizer
       class << self
         def tokenizer
           @tokenizer ||= Tiktoken.get_encoding("o200k_base")

@@ -54,5 +54,7 @@ module DiscourseAi
         end
       end
     end
+
+    OpenAiTokenizer = OpenAiO200kTokenizer
   end
 end
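The second hunk is the backward-compatibility piece: a plain constant assignment points the old name at the renamed class, so existing call sites need no changes. A quick check of what that buys callers:

require "discourse_ai/tokenizers"

# Both constants reference the same class object after the rename.
DiscourseAi::Tokenizer::OpenAiTokenizer.equal?(
  DiscourseAi::Tokenizer::OpenAiO200kTokenizer
) # => true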
data/lib/discourse_ai/tokenizers.rb CHANGED

@@ -8,6 +8,7 @@ require_relative "tokenizer/basic_tokenizer"
 require_relative "tokenizer/bert_tokenizer"
 require_relative "tokenizer/anthropic_tokenizer"
 require_relative "tokenizer/open_ai_tokenizer"
+require_relative "tokenizer/open_ai_cl100k_tokenizer"
 require_relative "tokenizer/all_mpnet_base_v2_tokenizer"
 require_relative "tokenizer/multilingual_e5_large_tokenizer"
 require_relative "tokenizer/bge_large_en_tokenizer"
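The placement of the new require is load-bearing: OpenAiCl100kTokenizer's superclass is the OpenAiTokenizer constant, which is only assigned at the bottom of open_ai_tokenizer.rb, so that file must load first:

# Superclass lookup happens at class-definition time, so this order matters;
# reversing it would raise NameError (uninitialized constant OpenAiTokenizer).
require_relative "tokenizer/open_ai_tokenizer"        # assigns the alias last
require_relative "tokenizer/open_ai_cl100k_tokenizer" # subclasses the alias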
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Rafael Silva

@@ -106,6 +106,7 @@ files:
 - lib/discourse_ai/tokenizer/llama3_tokenizer.rb
 - lib/discourse_ai/tokenizer/mistral_tokenizer.rb
 - lib/discourse_ai/tokenizer/multilingual_e5_large_tokenizer.rb
+- lib/discourse_ai/tokenizer/open_ai_cl100k_tokenizer.rb
 - lib/discourse_ai/tokenizer/open_ai_tokenizer.rb
 - lib/discourse_ai/tokenizer/qwen_tokenizer.rb
 - lib/discourse_ai/tokenizers.rb
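Aside from the version bump, the only gemspec change is the new file entry. To adopt the release in a project, a Gemfile sketch (the pessimistic constraint is one common choice, not something the gem mandates):

# Gemfile
gem "discourse_ai-tokenizers", "~> 0.3.0"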