discourse_ai-tokenizers 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/README.md +24 -24
- data/lib/discourse_ai/tokenizers/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d23181327ee259c76aa29f86a2b40702b39524705d11afd14239dfe6e3e90009
+  data.tar.gz: 76c2d7bbe1c4ebe97dff5576aec832e8664831ade0b819cf9afd244600f1be38
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02ad662edd31f57b8cba0b1ea221bb7c9f1684d65b9fcd1002739eebf4c1393152ee3f17ea675a8bd596eed4ba9a4fc61cea20b49bd7f3c5b86f64d0e2772bbb
+  data.tar.gz: 9869f1d01ce0388ac2bec619060e7ace28ad5155ff336667072d965a9405217b0ad21e36b258f384ef5456f3e3c3ee35b9c10c536a11e562c4ca5cb948c1ff81
```
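As an aside (not part of the diff), the published SHA256 digests can be checked against a locally fetched copy of the gem. A minimal Ruby sketch, assuming the 0.2.0 archive has been fetched and unpacked in the current directory (`gem fetch discourse_ai-tokenizers -v 0.2.0`, then `tar -xf discourse_ai-tokenizers-0.2.0.gem`, which yields `metadata.gz` and `data.tar.gz`):

```ruby
# Verification sketch: compare the SHA256 entries from checksums.yaml
# above against the files extracted from the downloaded .gem archive.
require "digest"

expected = {
  "metadata.gz" => "d23181327ee259c76aa29f86a2b40702b39524705d11afd14239dfe6e3e90009",
  "data.tar.gz" => "76c2d7bbe1c4ebe97dff5576aec832e8664831ade0b819cf9afd244600f1be38"
}

expected.each do |file, sha256|
  actual = Digest::SHA256.file(file).hexdigest
  puts "#{file}: #{actual == sha256 ? 'OK' : 'MISMATCH'}"
end
```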
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED

````diff
@@ -39,27 +39,27 @@ gem install discourse_ai-tokenizers
 require 'discourse_ai/tokenizers'
 
 # Get token count
-DiscourseAi::Tokenizers::OpenAiTokenizer.size("Hello world!")
+DiscourseAi::Tokenizer::OpenAiTokenizer.size("Hello world!")
 # => 3
 
 # Tokenize text
-DiscourseAi::Tokenizers::OpenAiTokenizer.tokenize("Hello world!")
+DiscourseAi::Tokenizer::OpenAiTokenizer.tokenize("Hello world!")
 # => [9906, 1917, 0]
 
 # Encode text to token IDs
-DiscourseAi::Tokenizers::OpenAiTokenizer.encode("Hello world!")
+DiscourseAi::Tokenizer::OpenAiTokenizer.encode("Hello world!")
 # => [9906, 1917, 0]
 
 # Decode token IDs back to text
-DiscourseAi::Tokenizers::OpenAiTokenizer.decode([9906, 1917, 0])
+DiscourseAi::Tokenizer::OpenAiTokenizer.decode([9906, 1917, 0])
 # => "Hello world!"
 
 # Truncate text to token limit
-DiscourseAi::Tokenizers::OpenAiTokenizer.truncate("This is a long sentence", 5)
+DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("This is a long sentence", 5)
 # => "This is a"
 
 # Check if text is within token limit
-DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Short text", 10)
+DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("Short text", 10)
 # => true
 ```
 
````
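The hunk above documents `encode` and `decode` with the same sample string; as a quick aside (not part of the diff), the pair should round-trip under the corrected 0.2.0 names:

```ruby
require "discourse_ai/tokenizers"

# encode and decode are inverses for the README's sample string
ids = DiscourseAi::Tokenizer::OpenAiTokenizer.encode("Hello world!")
# => [9906, 1917, 0]
DiscourseAi::Tokenizer::OpenAiTokenizer.decode(ids)
# => "Hello world!"
```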
````diff
@@ -67,27 +67,27 @@ DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Short text", 10)
 
 #### LLM Tokenizers
 
-- `DiscourseAi::Tokenizers::AnthropicTokenizer` - Claude models
-- `DiscourseAi::Tokenizers::OpenAiTokenizer` - GPT models
-- `DiscourseAi::Tokenizers::GeminiTokenizer` - Google Gemini
-- `DiscourseAi::Tokenizers::Llama3Tokenizer` - Meta Llama 3
-- `DiscourseAi::Tokenizers::QwenTokenizer` - Alibaba Qwen
-- `DiscourseAi::Tokenizers::MistralTokenizer` - Mistral models
+- `DiscourseAi::Tokenizer::AnthropicTokenizer` - Claude models
+- `DiscourseAi::Tokenizer::OpenAiTokenizer` - GPT models
+- `DiscourseAi::Tokenizer::GeminiTokenizer` - Google Gemini
+- `DiscourseAi::Tokenizer::Llama3Tokenizer` - Meta Llama 3
+- `DiscourseAi::Tokenizer::QwenTokenizer` - Alibaba Qwen
+- `DiscourseAi::Tokenizer::MistralTokenizer` - Mistral models
 
 #### Embedding Tokenizers
 
-- `DiscourseAi::Tokenizers::BertTokenizer` - BERT-based models
-- `DiscourseAi::Tokenizers::AllMpnetBaseV2Tokenizer` - sentence-transformers/all-mpnet-base-v2
-- `DiscourseAi::Tokenizers::BgeLargeEnTokenizer` - BAAI/bge-large-en
-- `DiscourseAi::Tokenizers::BgeM3Tokenizer` - BAAI/bge-m3
-- `DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer` - intfloat/multilingual-e5-large
+- `DiscourseAi::Tokenizer::BertTokenizer` - BERT-based models
+- `DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer` - sentence-transformers/all-mpnet-base-v2
+- `DiscourseAi::Tokenizer::BgeLargeEnTokenizer` - BAAI/bge-large-en
+- `DiscourseAi::Tokenizer::BgeM3Tokenizer` - BAAI/bge-m3
+- `DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer` - intfloat/multilingual-e5-large
 
 ### Getting Available LLM Tokenizers
 
 ```ruby
 # Get all available LLM tokenizers dynamically
-DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers
-# => [DiscourseAi::Tokenizers::AnthropicTokenizer, DiscourseAi::Tokenizers::OpenAiTokenizer, ...]
+DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers
+# => [DiscourseAi::Tokenizer::AnthropicTokenizer, DiscourseAi::Tokenizer::OpenAiTokenizer, ...]
 ```
 
 ### Advanced Usage
````
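As an aside (not from the diff), `available_llm_tokenizers` makes it easy to compare token counts across every LLM tokenizer listed above. A minimal sketch assuming only the 0.2.0 API shown in this hunk:

```ruby
require "discourse_ai/tokenizers"

text = "Hello world!"
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.each do |klass|
  # Each tokenizer class exposes the same .size interface
  puts "#{klass.name}: #{klass.size(text)} tokens"
end
```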
````diff
@@ -96,10 +96,10 @@ DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers
 
 ```ruby
 # Strict mode ensures exact token limit compliance
-DiscourseAi::Tokenizers::OpenAiTokenizer.truncate("Long text here", 5, strict: true)
+DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("Long text here", 5, strict: true)
 
 # Check limits with strict mode
-DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
+DiscourseAi::Tokenizer::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
 ```
 
 #### Unicode and Emoji Support
````
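A hedged sketch (not from the diff) contrasting default and strict truncation, using only the signatures this hunk documents; the idea that the default mode may be approximate is an assumption drawn from the README's "exact token limit compliance" wording:

```ruby
text = "This is a long sentence"

# Assumption: without strict mode, truncation may be approximate;
# strict: true re-validates the result against the exact token limit.
loose  = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 5)
strict = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 5, strict: true)

DiscourseAi::Tokenizer::OpenAiTokenizer.size(strict) <= 5 # => true
```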
````diff
@@ -107,11 +107,11 @@ DiscourseAi::Tokenizers::OpenAiTokenizer.below_limit?("Text", 10, strict: true)
 ```ruby
 # Handles unicode characters properly
 text = "Hello 世界 🌍 👨👩👧👦"
-DiscourseAi::Tokenizers::OpenAiTokenizer.size(text)
+DiscourseAi::Tokenizer::OpenAiTokenizer.size(text)
 # => 8
 
 # Truncation preserves unicode integrity
-truncated = DiscourseAi::Tokenizers::OpenAiTokenizer.truncate(text, 5)
+truncated = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 5)
 # => "Hello 世界 🌍"
 ```
 
````
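One way to exercise the "preserves unicode integrity" claim in this hunk, sketched under the same 0.2.0 names (not part of the diff); the `valid_encoding?` check is our own addition:

```ruby
text = "Hello 世界 🌍 👨👩👧👦"

truncated = DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 5)
# => "Hello 世界 🌍"

# A truncation that cut through a multi-byte character would leave an
# invalid UTF-8 string, so this should always hold.
truncated.valid_encoding? # => true
```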
````diff
@@ -157,4 +157,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
 
 ## Code of Conduct
 
-Everyone interacting in the DiscourseAi::Tokenizers project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
+Everyone interacting in the DiscourseAi::Tokenizer project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
````