discourse_ai-tokenizers 0.2.0 → 0.3.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d23181327ee259c76aa29f86a2b40702b39524705d11afd14239dfe6e3e90009
-  data.tar.gz: 76c2d7bbe1c4ebe97dff5576aec832e8664831ade0b819cf9afd244600f1be38
+  metadata.gz: c2d3921cc11a89b45ff8e5f7a58c4ae4cb170791a45dc445d1f908a5cc83a88a
+  data.tar.gz: 7d5d4a725d97d608baea0c3946bde1156c5f7808ad5d6e5bbbb420670e195287
 SHA512:
-  metadata.gz: 02ad662edd31f57b8cba0b1ea221bb7c9f1684d65b9fcd1002739eebf4c1393152ee3f17ea675a8bd596eed4ba9a4fc61cea20b49bd7f3c5b86f64d0e2772bbb
-  data.tar.gz: 9869f1d01ce0388ac2bec619060e7ace28ad5155ff336667072d965a9405217b0ad21e36b258f384ef5456f3e3c3ee35b9c10c536a11e562c4ca5cb948c1ff81
+  metadata.gz: 407774d2cfd411c88e4b43fb31aa572ea8a59bc285887f24dc96cf4843d7b8c1dc5b0c35b5731223a4e269772146ff8a9b499829bcace07b5b954f540d534bdf
+  data.tar.gz: da30167cc708d12dbba2763bdb31802c6ba165018d5ca7a1e698898ac5df41ee4b11c91aa24153588a238f98d1ebd886316e9716cf9bfef8c2ea65fbd9fec2a5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
 ## [Unreleased]
 
+## [0.3.0] - 2025-07-04
+
+- Add OpenAiCl100kTokenizer class for cl100k_base encoding
+- Refactor OpenAiTokenizer to OpenAiO200kTokenizer with backward compatibility alias
+- Update version to 0.3.0
+
 ## [0.2.0] - 2025-07-02
 
 - Initial release
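Note: the entries above add a second OpenAI encoding alongside the existing o200k_base one. As a hedged illustration of why the two are distinct (not part of the package diff; the sample text is arbitrary), the same string generally encodes to different token ids and counts under each vocabulary, using only the tiktoken_ruby calls already visible in the diff below:

    require "tiktoken_ruby"

    # Illustrative sketch only: compare the two encodings on the same input.
    o200k  = Tiktoken.get_encoding("o200k_base")
    cl100k = Tiktoken.get_encoding("cl100k_base")

    text = "Discourse AI tokenizers"
    o200k.encode(text)   # token ids under the o200k_base vocabulary
    cl100k.encode(text)  # generally different ids, and often a different count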
data/lib/discourse_ai/tokenizer/open_ai_cl100k_tokenizer.rb ADDED
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
+    class OpenAiCl100kTokenizer < OpenAiTokenizer
+      class << self
+        def tokenizer
+          @tokenizer ||= Tiktoken.get_encoding("cl100k_base")
+        end
+      end
+    end
+  end
+end
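The added class overrides only the memoized Tiktoken encoding; everything else is inherited from OpenAiTokenizer. A minimal usage sketch, not taken from the diff (`size` is assumed from the gem's BasicTokenizer interface, which this diff does not show):

    require "discourse_ai/tokenizers"

    tok = DiscourseAi::Tokenizer::OpenAiCl100kTokenizer
    tok.tokenizer            # lazily built, memoized cl100k_base encoding
    tok.size("hello world")  # assumed token-count helper inherited from BasicTokenizer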
data/lib/discourse_ai/tokenizer/open_ai_tokenizer.rb CHANGED
@@ -3,7 +3,7 @@
 module DiscourseAi
   module Tokenizer
     # Wrapper for OpenAI tokenizer library for compatibility with Discourse AI API
-    class OpenAiTokenizer < BasicTokenizer
+    class OpenAiO200kTokenizer < BasicTokenizer
       class << self
         def tokenizer
           @tokenizer ||= Tiktoken.get_encoding("o200k_base")
@@ -54,5 +54,7 @@ module DiscourseAi
         end
       end
     end
+
+    OpenAiTokenizer = OpenAiO200kTokenizer
   end
 end
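The new constant assignment is a plain alias: both names point at the same class object, so existing callers of OpenAiTokenizer transparently get the o200k_base implementation. A sketch of what that implies (illustrative, not part of the diff):

    # Constant assignment aliases the class object itself.
    DiscourseAi::Tokenizer::OpenAiTokenizer.equal?(
      DiscourseAi::Tokenizer::OpenAiO200kTokenizer
    )
    # => true

    # The class keeps the name it was originally defined with.
    DiscourseAi::Tokenizer::OpenAiTokenizer.name
    # => "DiscourseAi::Tokenizer::OpenAiO200kTokenizer"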
data/lib/discourse_ai/tokenizers/version.rb CHANGED
@@ -2,6 +2,6 @@
 
 module DiscourseAi
   module Tokenizers
-    VERSION = "0.2.0"
+    VERSION = "0.3.0"
   end
 end
data/lib/discourse_ai/tokenizers.rb CHANGED
@@ -8,6 +8,7 @@ require_relative "tokenizer/basic_tokenizer"
 require_relative "tokenizer/bert_tokenizer"
 require_relative "tokenizer/anthropic_tokenizer"
 require_relative "tokenizer/open_ai_tokenizer"
+require_relative "tokenizer/open_ai_cl100k_tokenizer"
 require_relative "tokenizer/all_mpnet_base_v2_tokenizer"
 require_relative "tokenizer/multilingual_e5_large_tokenizer"
 require_relative "tokenizer/bge_large_en_tokenizer"
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Rafael Silva
@@ -106,6 +106,7 @@ files:
 - lib/discourse_ai/tokenizer/llama3_tokenizer.rb
 - lib/discourse_ai/tokenizer/mistral_tokenizer.rb
 - lib/discourse_ai/tokenizer/multilingual_e5_large_tokenizer.rb
+- lib/discourse_ai/tokenizer/open_ai_cl100k_tokenizer.rb
 - lib/discourse_ai/tokenizer/open_ai_tokenizer.rb
 - lib/discourse_ai/tokenizer/qwen_tokenizer.rb
 - lib/discourse_ai/tokenizers.rb