discourse_ai-tokenizers 0.1.0 → 0.1.1

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 12f6645ef62162c14c3c1c43af8dae6dc890e96853207cf32575106c4b080d7e
-  data.tar.gz: 78c9b157f4a0bd490e4f8ea5f10f3c4793ef8f98baa05d8a773f0b5cd55e0e9c
+  metadata.gz: bc2fd76c9fd338fd19f6d56d4e21c98695d2ff8a2baf5626f36f7df7f98af3d9
+  data.tar.gz: d5e9b1ca74715a0346a3d22e2413834191488167e867d5bec795b6f9c9c25f5d
 SHA512:
-  metadata.gz: 715b68088a90292e82280b0a144a8b3040bd5e4f9720937f876f63d8ba91f711db07c62c9a8be1f48746b1fd38606a2b69b063b712f5949069763e7a63a5f010
-  data.tar.gz: 9e01d44c11d1f88edb838fb876d178b94e69497c73b38c70c4ca215a6296c0bffcf412c59924d1dcf7a64d147db3a30e4cc30d0b864ed02e65d7a0e07a758d03
+  metadata.gz: 4b06f6c801f878f173471a337f0d9d28c3321ba0b7b089876b83296353144257080bc0e1495bdbe020914f4cb6324696f9505a625cd01545b17d6e99624f190f
+  data.tar.gz: f7c37e3d464b16419c7218554a29514c63d84d1d4247969940d2f9d7f3316618ae593aa98df8941a44d2c721471f97073b75b37b2246a18349f43b6a324adb8a
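
Both digest sets change because the packaged files changed. The values can be checked locally: a .gem file is a tar archive whose members include metadata.gz and data.tar.gz, and the digests above are computed over those members. A minimal sketch in Ruby, assuming the archive was unpacked first (the filename is an example):

  # Unpack first, e.g.: tar xf discourse_ai-tokenizers-0.1.1.gem
  require "digest"

  actual = Digest::SHA256.file("data.tar.gz").hexdigest
  expected = "d5e9b1ca74715a0346a3d22e2413834191488167e867d5bec795b6f9c9c25f5d"
  raise "checksum mismatch" unless actual == expected

The Ruby hunks that follow all apply one pattern; it is explained after the first hunk below.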
@@ -5,7 +5,10 @@ module DiscourseAi
     # Tokenizer for the mpnet based embeddings models
     class AllMpnetBaseV2Tokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/all-mpnet-base-v2.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("all-mpnet-base-v2.json")
+          )
       end
     end
   end
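
The same one-line change repeats in each tokenizer class below: the bare relative path ("vendor/<file>.json") only resolved when the process's working directory happened to be the gem root, whereas the new DiscourseAi::Tokenizers.vendor_path helper (added in the last Ruby hunk of this diff) builds an absolute path from the gem's install location. A sketch of the old failure mode, illustrative rather than gem code, relying on the fact that relative paths resolve against Dir.pwd:

  require "tokenizers"

  Dir.chdir("/") do
    # Looks for "/vendor/all-mpnet-base-v2.json" and raises, which is
    # why 0.1.0 broke whenever the caller's cwd was not the gem root.
    Tokenizers.from_file("vendor/all-mpnet-base-v2.json")
  end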
@@ -6,7 +6,9 @@ module DiscourseAi
     class AnthropicTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file("vendor/claude-v1-tokenization.json")
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("claude-v1-tokenization.json")
+          )
       end
     end
   end
@@ -5,7 +5,10 @@ module DiscourseAi
     # Bert tokenizer, useful for lots of embeddings and small classification models
     class BertTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/bert-base-uncased.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bert-base-uncased.json")
+          )
       end
     end
   end
@@ -5,7 +5,10 @@ module DiscourseAi
     # Tokenizer used in bge-large-en-v1.5, the most common embeddings model used for Discourse
     class BgeLargeEnTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/bge-large-en.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bge-large-en.json")
+          )
       end
     end
   end
@@ -5,7 +5,10 @@ module DiscourseAi
     # Tokenizer used in bge-m3, a capable multilingual long context embeddings model.
     class BgeM3Tokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/bge-m3.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("bge-m3.json")
+          )
       end
     end
   end
@@ -5,7 +5,10 @@ module DiscourseAi
     # Tokenizer from Gemma3, which is said to be the same for Gemini
     class GeminiTokenizer < BasicTokenizer
      def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/gemma3.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("gemma3.json")
+          )
       end
     end
   end
@@ -6,7 +6,11 @@ module DiscourseAi
     class Llama3Tokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file("vendor/Meta-Llama-3-70B-Instruct.json")
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path(
+              "Meta-Llama-3-70B-Instruct.json"
+            )
+          )
       end
     end
   end
@@ -6,7 +6,11 @@ module DiscourseAi
     class MistralTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file("vendor/mistral-small-3.1-24b-2503.json")
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path(
+              "mistral-small-3.1-24b-2503.json"
+            )
+          )
       end
     end
   end
@@ -6,7 +6,9 @@ module DiscourseAi
     class MultilingualE5LargeTokenizer < BasicTokenizer
       def self.tokenizer
         @tokenizer ||=
-          ::Tokenizers.from_file("vendor/multilingual-e5-large.json")
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("multilingual-e5-large.json")
+          )
       end
     end
   end
@@ -5,7 +5,10 @@ module DiscourseAi
     # Tokenizer from Qwen3 LLM series. Also compatible with their embedding models
     class QwenTokenizer < BasicTokenizer
       def self.tokenizer
-        @tokenizer ||= ::Tokenizers.from_file("vendor/qwen3.json")
+        @tokenizer ||=
+          ::Tokenizers.from_file(
+            DiscourseAi::Tokenizers.vendor_path("qwen3.json")
+          )
       end
     end
   end
@@ -2,6 +2,6 @@
 
 module DiscourseAi
   module Tokenizers
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
   end
 end
@@ -20,6 +20,13 @@ module DiscourseAi
   module Tokenizers
     class Error < StandardError
     end
-    # Your code goes here...
+
+    def self.gem_root
+      @gem_root ||= File.expand_path("../../..", __FILE__)
+    end
+
+    def self.vendor_path(filename)
+      File.join(gem_root, "vendor", filename)
+    end
   end
 end
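
These two helpers are what the tokenizer classes above call. The "../../.." in gem_root climbs three directories from __FILE__, which implies this file lives at lib/discourse_ai/tokenizers.rb and resolves to the installed gem's root, so vendor_path yields an absolute path regardless of the caller's working directory. A hypothetical session (the install path is an example and the require path is assumed from the gem name):

  require "discourse_ai/tokenizers"

  DiscourseAi::Tokenizers.gem_root
  # => "/usr/lib/ruby/gems/3.2.0/gems/discourse_ai-tokenizers-0.1.1"
  DiscourseAi::Tokenizers.vendor_path("qwen3.json")
  # => "/usr/lib/ruby/gems/3.2.0/gems/discourse_ai-tokenizers-0.1.1/vendor/qwen3.json"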
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discourse_ai-tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Rafael Silva
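
Net effect of 0.1.1: every tokenizer class loads its vendored JSON from any working directory. A short usage sketch, assuming the gem's require path; encode, tokens, and ids are the tokenizers gem's standard Encoding API:

  require "discourse_ai/tokenizers"

  # .tokenizer memoizes and returns a Tokenizers::Tokenizer instance.
  encoding = DiscourseAi::Tokenizers::BertTokenizer.tokenizer.encode("Hello, Discourse!")
  encoding.tokens # subword strings
  encoding.ids    # corresponding vocabulary ids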