discourse_ai-tokenizers 0.1.1 → 0.2.0

This diff compares publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: bc2fd76c9fd338fd19f6d56d4e21c98695d2ff8a2baf5626f36f7df7f98af3d9
-   data.tar.gz: d5e9b1ca74715a0346a3d22e2413834191488167e867d5bec795b6f9c9c25f5d
+   metadata.gz: d23181327ee259c76aa29f86a2b40702b39524705d11afd14239dfe6e3e90009
+   data.tar.gz: 76c2d7bbe1c4ebe97dff5576aec832e8664831ade0b819cf9afd244600f1be38
  SHA512:
-   metadata.gz: 4b06f6c801f878f173471a337f0d9d28c3321ba0b7b089876b83296353144257080bc0e1495bdbe020914f4cb6324696f9505a625cd01545b17d6e99624f190f
-   data.tar.gz: f7c37e3d464b16419c7218554a29514c63d84d1d4247969940d2f9d7f3316618ae593aa98df8941a44d2c721471f97073b75b37b2246a18349f43b6a324adb8a
+   metadata.gz: 02ad662edd31f57b8cba0b1ea221bb7c9f1684d65b9fcd1002739eebf4c1393152ee3f17ea675a8bd596eed4ba9a4fc61cea20b49bd7f3c5b86f64d0e2772bbb
+   data.tar.gz: 9869f1d01ce0388ac2bec619060e7ace28ad5155ff336667072d965a9405217b0ad21e36b258f384ef5456f3e3c3ee35b9c10c536a11e562c4ca5cb948c1ff81
data/CHANGELOG.md CHANGED
@@ -1,5 +1,5 @@
  ## [Unreleased]

- ## [0.1.0] - 2025-06-30
+ ## [0.2.0] - 2025-07-02

  - Initial release
data/README.md CHANGED
@@ -39,27 +39,27 @@ gem install discourse_ai-tokenizers
  require 'discourse_ai/tokenizers'

  # Get token count
- DiscourseAi::Tokenizers::OpenAiTokenizer.size("Hello world!")
+ DiscourseAi::Tokenizer::OpenAiTokenizer.size("Hello world!")
  # => 3

  # Tokenize text
- DiscourseAi::Tokenizers::OpenAiTokenizer.tokenize("Hello world!")
+ DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize("Hello world!")
  # => [9906, 1917, 0]

  # Encode text to token IDs
- DiscourseAi::Tokenizers::OpenAiTokenizer.encode("Hello world!")
+ DiscourseAi::Tokenizer::OpenAiTokenizer.encode("Hello world!")
  # => [9906, 1917, 0]

  # Decode token IDs back to text
- DiscourseAi::Tokenizers::OpenAiTokenizer.decode([9906, 1917, 0])
+ DiscourseAi::Tokenizer::OpenAiTokenizer.decode([9906, 1917, 0])
  # => "Hello world!"

  # Truncate text to token limit
- DiscourseAi::Tokenizers::OpenAiTokenizer.truncate("This is a long sentence", 5)
+ DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("This is a long sentence", 5)
  # => "This is a"

  # Check if text is within token limit
- DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Short text", 10)
+ DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("Short text", 10)
  # => true
  ```

@@ -67,27 +67,27 @@ DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Short text", 10)

  #### LLM Tokenizers

- - `DiscourseAi::Tokenizers::AnthropicTokenizer` - Claude models
- - `DiscourseAi::Tokenizers::OpenAiTokenizer` - GPT models
- - `DiscourseAi::Tokenizers::GeminiTokenizer` - Google Gemini
- - `DiscourseAi::Tokenizers::Llama3Tokenizer` - Meta Llama 3
- - `DiscourseAi::Tokenizers::QwenTokenizer` - Alibaba Qwen
- - `DiscourseAi::Tokenizers::MistralTokenizer` - Mistral models
+ - `DiscourseAi::Tokenizer::AnthropicTokenizer` - Claude models
+ - `DiscourseAi::Tokenizer::OpenAiTokenizer` - GPT models
+ - `DiscourseAi::Tokenizer::GeminiTokenizer` - Google Gemini
+ - `DiscourseAi::Tokenizer::Llama3Tokenizer` - Meta Llama 3
+ - `DiscourseAi::Tokenizer::QwenTokenizer` - Alibaba Qwen
+ - `DiscourseAi::Tokenizer::MistralTokenizer` - Mistral models

  #### Embedding Tokenizers

- - `DiscourseAi::Tokenizers::BertTokenizer` - BERT-based models
- - `DiscourseAi::Tokenizers::AllMpnetBaseV2Tokenizer` - sentence-transformers/all-mpnet-base-v2
- - `DiscourseAi::Tokenizers::BgeLargeEnTokenizer` - BAAI/bge-large-en
- - `DiscourseAi::Tokenizers::BgeM3Tokenizer` - BAAI/bge-m3
- - `DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer` - intfloat/multilingual-e5-large
+ - `DiscourseAi::Tokenizer::BertTokenizer` - BERT-based models
+ - `DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer` - sentence-transformers/all-mpnet-base-v2
+ - `DiscourseAi::Tokenizer::BgeLargeEnTokenizer` - BAAI/bge-large-en
+ - `DiscourseAi::Tokenizer::BgeM3Tokenizer` - BAAI/bge-m3
+ - `DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer` - intfloat/multilingual-e5-large

  ### Getting Available LLM Tokenizers

  ```ruby
  # Get all available LLM tokenizers dynamically
- DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers
- # => [DiscourseAi::Tokenizers::AnthropicTokenizer, DiscourseAi::Tokenizers::OpenAiTokenizer, ...]
+ DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers
+ # => [DiscourseAi::Tokenizer::AnthropicTokenizer, DiscourseAi::Tokenizer::OpenAiTokenizer, ...]
  ```

  ### Advanced Usage
@@ -96,10 +96,10 @@ DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers

  ```ruby
  # Strict mode ensures exact token limit compliance
- DiscourseAi::Tokenizers::OpenAiTokenizer.truncate("Long text here", 5, strict: true)
+ DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("Long text here", 5, strict: true)

  # Check limits with strict mode
- DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
+ DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
  ```

  #### Unicode and Emoji Support
@@ -107,11 +107,11 @@ DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
  ```ruby
  # Handles unicode characters properly
  text = "Hello 世界 🌍 👨‍👩‍👧‍👦"
- DiscourseAi::Tokenizers::OpenAiTokenizer.size(text)
+ DiscourseAi::Tokenizer::OpenAiTokenizer.size(text)
  # => 8

  # Truncation preserves unicode integrity
- truncated = DiscourseAi::Tokenizers::OpenAiTokenizer.truncate(text, 5)
+ truncated = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 5)
  # => "Hello 世界 🌍"
  ```

@@ -157,4 +157,4 @@ The gem is available as open source under the terms of the [MIT License](https:/

  ## Code of Conduct

- Everyone interacting in the DiscourseAi::Tokenizers project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
+ Everyone interacting in the DiscourseAi::Tokenizer project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
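The net effect of the README changes above is a namespace rename: tokenizer classes documented under `DiscourseAi::Tokenizers` in 0.1.1 are documented under `DiscourseAi::Tokenizer` in 0.2.0, while the entry point `require "discourse_ai/tokenizers"` is unchanged. A minimal before/after sketch, using only calls and values that appear in the README diff:

```ruby
require "discourse_ai/tokenizers"

# 0.1.1 documented the classes under DiscourseAi::Tokenizers:
# DiscourseAi::Tokenizers::OpenAiTokenizer.size("Hello world!")  # => 3

# 0.2.0 documents the same classes under DiscourseAi::Tokenizer:
DiscourseAi::Tokenizer::OpenAiTokenizer.size("Hello world!")
# => 3
DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("This is a long sentence", 5)
# => "This is a"
```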
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer for the mpnet based embeddings models
      class AllMpnetBaseV2Tokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Extracted from Anthropic's python SDK, compatible with first Claude versions
      class AnthropicTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,18 +1,18 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Base class for tokenizers to inherit from
      class BasicTokenizer
        class << self
          def available_llm_tokenizers
            [
-             DiscourseAi::Tokenizers::AnthropicTokenizer,
-             DiscourseAi::Tokenizers::GeminiTokenizer,
-             DiscourseAi::Tokenizers::Llama3Tokenizer,
-             DiscourseAi::Tokenizers::MistralTokenizer,
-             DiscourseAi::Tokenizers::OpenAiTokenizer,
-             DiscourseAi::Tokenizers::QwenTokenizer
+             DiscourseAi::Tokenizer::AnthropicTokenizer,
+             DiscourseAi::Tokenizer::GeminiTokenizer,
+             DiscourseAi::Tokenizer::Llama3Tokenizer,
+             DiscourseAi::Tokenizer::MistralTokenizer,
+             DiscourseAi::Tokenizer::OpenAiTokenizer,
+             DiscourseAi::Tokenizer::QwenTokenizer
            ]
          end

@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Bert tokenizer, useful for lots of embeddings and small classification models
      class BertTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer used in bge-large-en-v1.5, the most common embeddings model used for Discourse
      class BgeLargeEnTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer used in bge-m3, a capable multilingual long context embeddings model.
      class BgeM3Tokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer from Gemma3, which is said to be the same for Gemini
      class GeminiTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer from Llama3, popular open weights LLM
      class Llama3Tokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer from Mistral Small 2503 LLM
      class MistralTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer from multilingual-e5-large, first multilingual embeddings model used in Discourse
      class MultilingualE5LargeTokenizer < BasicTokenizer
        def self.tokenizer
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
      class OpenAiTokenizer < BasicTokenizer
        class << self
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      # Tokenizer from Qwen3 LLM series. Also compatible with their embedding models
      class QwenTokenizer < BasicTokenizer
        def self.tokenizer
@@ -2,6 +2,6 @@

  module DiscourseAi
    module Tokenizers
-     VERSION = "0.1.1"
+     VERSION = "0.2.0"
    end
  end
@@ -3,18 +3,19 @@
  require "tokenizers"
  require "tiktoken_ruby"
  require_relative "tokenizers/version"
- require_relative "tokenizers/basic_tokenizer"
- require_relative "tokenizers/bert_tokenizer"
- require_relative "tokenizers/anthropic_tokenizer"
- require_relative "tokenizers/open_ai_tokenizer"
- require_relative "tokenizers/all_mpnet_base_v2_tokenizer"
- require_relative "tokenizers/multilingual_e5_large_tokenizer"
- require_relative "tokenizers/bge_large_en_tokenizer"
- require_relative "tokenizers/bge_m3_tokenizer"
- require_relative "tokenizers/llama3_tokenizer"
- require_relative "tokenizers/gemini_tokenizer"
- require_relative "tokenizers/qwen_tokenizer"
- require_relative "tokenizers/mistral_tokenizer"
+
+ require_relative "tokenizer/basic_tokenizer"
+ require_relative "tokenizer/bert_tokenizer"
+ require_relative "tokenizer/anthropic_tokenizer"
+ require_relative "tokenizer/open_ai_tokenizer"
+ require_relative "tokenizer/all_mpnet_base_v2_tokenizer"
+ require_relative "tokenizer/multilingual_e5_large_tokenizer"
+ require_relative "tokenizer/bge_large_en_tokenizer"
+ require_relative "tokenizer/bge_m3_tokenizer"
+ require_relative "tokenizer/llama3_tokenizer"
+ require_relative "tokenizer/gemini_tokenizer"
+ require_relative "tokenizer/qwen_tokenizer"
+ require_relative "tokenizer/mistral_tokenizer"

  module DiscourseAi
    module Tokenizers
@@ -1,5 +1,5 @@
  module DiscourseAi
-   module Tokenizers
+   module Tokenizer
      VERSION: String
      # See the writing guide of rbs: https://github.com/ruby/rbs#guides
    end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: discourse_ai-tokenizers
  version: !ruby/object:Gem::Version
-   version: 0.1.1
+   version: 0.2.0
  platform: ruby
  authors:
  - Rafael Silva
@@ -96,19 +96,19 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - lib/discourse_ai/tokenizer/all_mpnet_base_v2_tokenizer.rb
+ - lib/discourse_ai/tokenizer/anthropic_tokenizer.rb
+ - lib/discourse_ai/tokenizer/basic_tokenizer.rb
+ - lib/discourse_ai/tokenizer/bert_tokenizer.rb
+ - lib/discourse_ai/tokenizer/bge_large_en_tokenizer.rb
+ - lib/discourse_ai/tokenizer/bge_m3_tokenizer.rb
+ - lib/discourse_ai/tokenizer/gemini_tokenizer.rb
+ - lib/discourse_ai/tokenizer/llama3_tokenizer.rb
+ - lib/discourse_ai/tokenizer/mistral_tokenizer.rb
+ - lib/discourse_ai/tokenizer/multilingual_e5_large_tokenizer.rb
+ - lib/discourse_ai/tokenizer/open_ai_tokenizer.rb
+ - lib/discourse_ai/tokenizer/qwen_tokenizer.rb
  - lib/discourse_ai/tokenizers.rb
- - lib/discourse_ai/tokenizers/all_mpnet_base_v2_tokenizer.rb
- - lib/discourse_ai/tokenizers/anthropic_tokenizer.rb
- - lib/discourse_ai/tokenizers/basic_tokenizer.rb
- - lib/discourse_ai/tokenizers/bert_tokenizer.rb
- - lib/discourse_ai/tokenizers/bge_large_en_tokenizer.rb
- - lib/discourse_ai/tokenizers/bge_m3_tokenizer.rb
- - lib/discourse_ai/tokenizers/gemini_tokenizer.rb
- - lib/discourse_ai/tokenizers/llama3_tokenizer.rb
- - lib/discourse_ai/tokenizers/mistral_tokenizer.rb
- - lib/discourse_ai/tokenizers/multilingual_e5_large_tokenizer.rb
- - lib/discourse_ai/tokenizers/open_ai_tokenizer.rb
- - lib/discourse_ai/tokenizers/qwen_tokenizer.rb
  - lib/discourse_ai/tokenizers/version.rb
  - sig/discourse_ai/tokenizers.rbs
  - vendor/Meta-Llama-3-70B-Instruct.json