discourse_ai-tokenizers 0.1.0 → 0.1.2
This diff shows the changes between publicly available package versions as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/discourse_ai/{tokenizers → tokenizer}/all_mpnet_base_v2_tokenizer.rb +5 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/anthropic_tokenizer.rb +4 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/basic_tokenizer.rb +7 -7
- data/lib/discourse_ai/{tokenizers → tokenizer}/bert_tokenizer.rb +5 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/bge_large_en_tokenizer.rb +5 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/bge_m3_tokenizer.rb +5 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/gemini_tokenizer.rb +5 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/llama3_tokenizer.rb +6 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/mistral_tokenizer.rb +6 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/multilingual_e5_large_tokenizer.rb +4 -2
- data/lib/discourse_ai/{tokenizers → tokenizer}/open_ai_tokenizer.rb +1 -1
- data/lib/discourse_ai/{tokenizers → tokenizer}/qwen_tokenizer.rb +5 -2
- data/lib/discourse_ai/tokenizers/version.rb +1 -1
- data/lib/discourse_ai/tokenizers.rb +21 -13
- data/sig/discourse_ai/tokenizers.rbs +1 -1
- metadata +13 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b9e8c362a4c2c227617258dee4d75e3d5555e1c4fde8575cd060a0efa2eb36b9
+  data.tar.gz: e9b3a6f950399628d6faec4bcf5a472c556a8f936c8fe82d511c63077f2d6c64
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 69ef898cca64debd9d89297caf97531a9523e6d1fac769eaf86db05aca5aa3568b1e54b5065e09bda35b67deb447e38a1ef31997e3fb23ac0fa77d0a12af2983
+  data.tar.gz: b5561b0d77c81fb5076adfbd74a4232ba1a0c8b50fbb1de0c1f59de0c0f8753e79890f9439b375146dc28b59db434da3003d49cae789b3cd19ddcfb3f94c3ecb
data/lib/discourse_ai/{tokenizers → tokenizer}/all_mpnet_base_v2_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer for the mpnet based embeddings models
     class AllMpnetBaseV2Tokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("all-mpnet-base-v2.json")
+          )
       end
     end
   end
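Each renamed tokenizer class keeps the same memoized `self.tokenizer` entry point, but now resolves its vendored JSON file through `DiscourseAi::Tokenizers.vendor_path`. A minimal usage sketch, assuming the `encode`/`tokens` API of the underlying `tokenizers` gem (illustrative; not taken from this package):

```ruby
require "discourse_ai/tokenizers"

# The first call loads vendor/all-mpnet-base-v2.json; later calls reuse @tokenizer.
tokenizer = DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer.tokenizer

# Encoding goes through the wrapped ::Tokenizers object (assumed API).
encoding = tokenizer.encode("Hello, Discourse!")
encoding.tokens # => subword tokens from the mpnet vocabulary (illustrative)
```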
data/lib/discourse_ai/{tokenizers → tokenizer}/anthropic_tokenizer.rb
RENAMED
@@ -1,12 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Extracted from Anthropic's python SDK, compatible with first Claude versions
     class AnthropicTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file(
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("claude-v1-tokenization.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/basic_tokenizer.rb
RENAMED
@@ -1,18 +1,18 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Base class for tokenizers to inherit from
     class BasicTokenizer
       class << self
         def available_llm_tokenizers
           [
-            DiscourseAi::
-            DiscourseAi::
-            DiscourseAi::
-            DiscourseAi::
-            DiscourseAi::
-            DiscourseAi::
+            DiscourseAi::Tokenizer::AnthropicTokenizer,
+            DiscourseAi::Tokenizer::GeminiTokenizer,
+            DiscourseAi::Tokenizer::Llama3Tokenizer,
+            DiscourseAi::Tokenizer::MistralTokenizer,
+            DiscourseAi::Tokenizer::OpenAiTokenizer,
+            DiscourseAi::Tokenizer::QwenTokenizer
           ]
         end

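`available_llm_tokenizers` is the list consumers can use to enumerate the LLM tokenizers bundled with the gem, now referenced under the new `Tokenizer` namespace. A small sketch of enumerating it (consumer code in Discourse may differ):

```ruby
require "discourse_ai/tokenizers"

# Print the constant name of every bundled LLM tokenizer class.
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.each do |klass|
  puts klass.name # e.g. "DiscourseAi::Tokenizer::AnthropicTokenizer"
end
```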
data/lib/discourse_ai/{tokenizers → tokenizer}/bert_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Bert tokenizer, useful for lots of embeddings and small classification models
     class BertTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bert-base-uncased.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/bge_large_en_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer used in bge-large-en-v1.5, the most common embeddings model used for Discourse
     class BgeLargeEnTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bge-large-en.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/bge_m3_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer used in bge-m3, a capable multilingual long context embeddings model.
     class BgeM3Tokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bge-m3.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/gemini_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer from Gemma3, which is said to be the same for Gemini
     class GeminiTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("gemma3.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/llama3_tokenizer.rb
RENAMED
@@ -1,12 +1,16 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer from Llama3, popular open weights LLM
     class Llama3Tokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file(
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path(
+              "Meta-Llama-3-70B-Instruct.json"
+            )
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/mistral_tokenizer.rb
RENAMED
@@ -1,12 +1,16 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer from Mistral Small 2503 LLM
     class MistralTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file(
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path(
+              "mistral-small-3.1-24b-2503.json"
+            )
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/multilingual_e5_large_tokenizer.rb
RENAMED
@@ -1,12 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer from multilingual-e5-large, first multilingual embeddings model used in Discourse
     class MultilingualE5LargeTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file(
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("multilingual-e5-large.json")
+          )
       end
     end
   end
data/lib/discourse_ai/{tokenizers → tokenizer}/qwen_tokenizer.rb
RENAMED
@@ -1,11 +1,14 @@
 # frozen_string_literal: true

 module DiscourseAi
-  module
+  module Tokenizer
     # Tokenizer from Qwen3 LLM series. Also compatible with their embedding models
     class QwenTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||=
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("qwen3.json")
+          )
       end
     end
   end
data/lib/discourse_ai/tokenizers.rb
CHANGED
@@ -3,23 +3,31 @@
 require "tokenizers"
 require "tiktoken_ruby"
 require_relative "tokenizers/version"
-
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
-require_relative "
+
+require_relative "tokenizer/basic_tokenizer"
+require_relative "tokenizer/bert_tokenizer"
+require_relative "tokenizer/anthropic_tokenizer"
+require_relative "tokenizer/open_ai_tokenizer"
+require_relative "tokenizer/all_mpnet_base_v2_tokenizer"
+require_relative "tokenizer/multilingual_e5_large_tokenizer"
+require_relative "tokenizer/bge_large_en_tokenizer"
+require_relative "tokenizer/bge_m3_tokenizer"
+require_relative "tokenizer/llama3_tokenizer"
+require_relative "tokenizer/gemini_tokenizer"
+require_relative "tokenizer/qwen_tokenizer"
+require_relative "tokenizer/mistral_tokenizer"

 module DiscourseAi
   module Tokenizers
     class Error < StandardError
     end
-
+
+    def self.gem_root
+      @gem_root ||= File.expand_path("../../..", __FILE__)
+    end
+
+    def self.vendor_path(filename)
+      File.join(gem_root, "vendor", filename)
+    end
   end
 end
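The new `gem_root` and `vendor_path` helpers centralize path resolution for the vendored JSON files, and every tokenizer class above now calls `vendor_path` instead of building the path itself. A sketch of what they return (paths shown are illustrative):

```ruby
require "discourse_ai/tokenizers"

# gem_root resolves three levels up from lib/discourse_ai/tokenizers.rb,
# i.e. the installed gem's root directory.
DiscourseAi::Tokenizers.gem_root
# => ".../gems/discourse_ai-tokenizers-0.1.2" (illustrative)

# vendor_path joins the gem root, "vendor", and the requested file name.
DiscourseAi::Tokenizers.vendor_path("Meta-Llama-3-70B-Instruct.json")
# => ".../gems/discourse_ai-tokenizers-0.1.2/vendor/Meta-Llama-3-70B-Instruct.json"
```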
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Rafael Silva
@@ -96,19 +96,19 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- lib/discourse_ai/tokenizer/all_mpnet_base_v2_tokenizer.rb
+- lib/discourse_ai/tokenizer/anthropic_tokenizer.rb
+- lib/discourse_ai/tokenizer/basic_tokenizer.rb
+- lib/discourse_ai/tokenizer/bert_tokenizer.rb
+- lib/discourse_ai/tokenizer/bge_large_en_tokenizer.rb
+- lib/discourse_ai/tokenizer/bge_m3_tokenizer.rb
+- lib/discourse_ai/tokenizer/gemini_tokenizer.rb
+- lib/discourse_ai/tokenizer/llama3_tokenizer.rb
+- lib/discourse_ai/tokenizer/mistral_tokenizer.rb
+- lib/discourse_ai/tokenizer/multilingual_e5_large_tokenizer.rb
+- lib/discourse_ai/tokenizer/open_ai_tokenizer.rb
+- lib/discourse_ai/tokenizer/qwen_tokenizer.rb
 - lib/discourse_ai/tokenizers.rb
-- lib/discourse_ai/tokenizers/all_mpnet_base_v2_tokenizer.rb
-- lib/discourse_ai/tokenizers/anthropic_tokenizer.rb
-- lib/discourse_ai/tokenizers/basic_tokenizer.rb
-- lib/discourse_ai/tokenizers/bert_tokenizer.rb
-- lib/discourse_ai/tokenizers/bge_large_en_tokenizer.rb
-- lib/discourse_ai/tokenizers/bge_m3_tokenizer.rb
-- lib/discourse_ai/tokenizers/gemini_tokenizer.rb
-- lib/discourse_ai/tokenizers/llama3_tokenizer.rb
-- lib/discourse_ai/tokenizers/mistral_tokenizer.rb
-- lib/discourse_ai/tokenizers/multilingual_e5_large_tokenizer.rb
-- lib/discourse_ai/tokenizers/open_ai_tokenizer.rb
-- lib/discourse_ai/tokenizers/qwen_tokenizer.rb
 - lib/discourse_ai/tokenizers/version.rb
 - sig/discourse_ai/tokenizers.rbs
 - vendor/Meta-Llama-3-70B-Instruct.json