sorbet-baml 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +123 -2
- data/Rakefile +2 -2
- data/docs-site/.gitignore +48 -0
- data/docs-site/Gemfile +5 -0
- data/docs-site/Gemfile.lock +140 -0
- data/docs-site/Rakefile +3 -0
- data/docs-site/bridgetown.config.yml +15 -0
- data/docs-site/config/initializers.rb +9 -0
- data/docs-site/config/puma.rb +9 -0
- data/docs-site/config.ru +5 -0
- data/docs-site/esbuild.config.js +11 -0
- data/docs-site/frontend/javascript/index.js +22 -0
- data/docs-site/frontend/styles/index.css +61 -0
- data/docs-site/package.json +18 -0
- data/docs-site/postcss.config.js +6 -0
- data/docs-site/server/roda_app.rb +9 -0
- data/docs-site/src/_components/head.liquid +26 -0
- data/docs-site/src/_components/nav.liquid +68 -0
- data/docs-site/src/_layouts/default.liquid +27 -0
- data/docs-site/src/_layouts/doc.liquid +39 -0
- data/docs-site/src/advanced-usage.md +598 -0
- data/docs-site/src/getting-started.md +170 -0
- data/docs-site/src/index.md +183 -0
- data/docs-site/src/troubleshooting.md +317 -0
- data/docs-site/src/type-mapping.md +236 -0
- data/docs-site/tailwind.config.js +85 -0
- data/examples/description_parameters.rb +16 -16
- data/lib/sorbet_baml/comment_extractor.rb +31 -39
- data/lib/sorbet_baml/converter.rb +66 -32
- data/lib/sorbet_baml/dependency_resolver.rb +11 -11
- data/lib/sorbet_baml/description_extension.rb +5 -5
- data/lib/sorbet_baml/description_extractor.rb +8 -10
- data/lib/sorbet_baml/dspy_tool_converter.rb +97 -0
- data/lib/sorbet_baml/dspy_tool_extensions.rb +23 -0
- data/lib/sorbet_baml/enum_extensions.rb +2 -2
- data/lib/sorbet_baml/struct_extensions.rb +2 -2
- data/lib/sorbet_baml/tool_extensions.rb +23 -0
- data/lib/sorbet_baml/type_mapper.rb +35 -37
- data/lib/sorbet_baml/version.rb +1 -1
- data/lib/sorbet_baml.rb +41 -13
- data/sorbet/config +2 -0
- data/sorbet/rbi/gems/anthropic@1.5.0.rbi +21252 -0
- data/sorbet/rbi/gems/async@2.27.3.rbi +9 -0
- data/sorbet/rbi/gems/bigdecimal@3.2.2.rbi +9 -0
- data/sorbet/rbi/gems/concurrent-ruby@1.3.5.rbi +424 -0
- data/sorbet/rbi/gems/connection_pool@2.5.3.rbi +9 -0
- data/sorbet/rbi/gems/console@1.33.0.rbi +9 -0
- data/sorbet/rbi/gems/dry-configurable@1.3.0.rbi +672 -0
- data/sorbet/rbi/gems/dry-core@1.1.0.rbi +1729 -0
- data/sorbet/rbi/gems/dry-logger@1.1.0.rbi +1317 -0
- data/sorbet/rbi/gems/dspy@0.19.1.rbi +6677 -0
- data/sorbet/rbi/gems/ffi@1.17.2.rbi +2174 -0
- data/sorbet/rbi/gems/fiber-annotation@0.2.0.rbi +9 -0
- data/sorbet/rbi/gems/fiber-local@1.1.0.rbi +9 -0
- data/sorbet/rbi/gems/fiber-storage@1.0.1.rbi +9 -0
- data/sorbet/rbi/gems/google-protobuf@4.32.0.rbi +9 -0
- data/sorbet/rbi/gems/googleapis-common-protos-types@1.20.0.rbi +9 -0
- data/sorbet/rbi/gems/informers@1.2.1.rbi +1875 -0
- data/sorbet/rbi/gems/io-event@1.12.1.rbi +9 -0
- data/sorbet/rbi/gems/metrics@0.13.0.rbi +9 -0
- data/sorbet/rbi/gems/onnxruntime@0.10.0.rbi +304 -0
- data/sorbet/rbi/gems/openai@0.16.0.rbi +68055 -0
- data/sorbet/rbi/gems/opentelemetry-api@1.6.0.rbi +9 -0
- data/sorbet/rbi/gems/opentelemetry-common@0.22.0.rbi +9 -0
- data/sorbet/rbi/gems/opentelemetry-exporter-otlp@0.30.0.rbi +9 -0
- data/sorbet/rbi/gems/opentelemetry-registry@0.4.0.rbi +9 -0
- data/sorbet/rbi/gems/opentelemetry-sdk@1.8.1.rbi +9 -0
- data/sorbet/rbi/gems/opentelemetry-semantic_conventions@1.11.0.rbi +9 -0
- data/sorbet/rbi/gems/polars-df@0.20.0.rbi +9 -0
- data/sorbet/rbi/gems/sorbet-result@1.4.0.rbi +242 -0
- data/sorbet/rbi/gems/sorbet-schema@0.9.2.rbi +743 -0
- data/sorbet/rbi/gems/sorbet-struct-comparable@1.3.0.rbi +48 -0
- data/sorbet/rbi/gems/tokenizers@0.5.5.rbi +754 -0
- data/sorbet/rbi/gems/traces@0.17.0.rbi +9 -0
- data/sorbet/rbi/gems/zeitwerk@2.7.3.rbi +1429 -0
- metadata +63 -2
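
The notable functional change in 0.3.0 visible in the manifest above is tool support: alongside the existing struct and enum extensions, the release adds dspy_tool_converter.rb, dspy_tool_extensions.rb, and tool_extensions.rb, plus vendored RBIs for the dspy gem. A hypothetical sketch of the call pattern, assuming the release mirrors the to_baml style suggested by the existing struct_extensions.rb and enum_extensions.rb file names (the method names below are illustrative assumptions, not the gem's confirmed 0.3.0 API):

# Hypothetical sketch only: `to_baml` is inferred from the extension file
# names in this diff, not from documented API.
require "sorbet-runtime"
require "sorbet_baml"

class Address < T::Struct
  const :street, String
  const :city, String
end

class Customer < T::Struct
  const :name, String
  const :address, Address   # dependency_resolver.rb suggests nested types are resolved
end

# Emit a BAML class definition for Customer (and, presumably, Address):
puts Customer.to_baml

The largest single addition, reproduced below, is the Tapioca-generated RBI for the tokenizers gem.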
data/sorbet/rbi/gems/tokenizers@0.5.5.rbi
@@ -0,0 +1,754 @@
+# typed: true
+
+# DO NOT EDIT MANUALLY
+# This is an autogenerated file for types exported from the `tokenizers` gem.
+# Please instead update this file by running `bin/tapioca gem tokenizers`.
+
+
+# source://tokenizers//lib/tokenizers/decoders/bpe_decoder.rb#1
+module Tokenizers
+  class << self
+    # source://tokenizers//lib/tokenizers.rb#59
+    def from_file(*_arg0, **_arg1, &_arg2); end
+
+    # source://tokenizers//lib/tokenizers.rb#55
+    def from_pretrained(*_arg0, **_arg1, &_arg2); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/added_token.rb#2
+class Tokenizers::AddedToken
+  def content; end
+  def lstrip; end
+  def normalized; end
+  def rstrip; end
+  def single_word; end
+  def special; end
+
+  class << self
+    def _new(_arg0, _arg1); end
+
+    # source://tokenizers//lib/tokenizers/added_token.rb#3
+    def new(content, **kwargs); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/char_bpe_tokenizer.rb#2
+class Tokenizers::CharBPETokenizer
+  # @return [CharBPETokenizer] a new instance of CharBPETokenizer
+  #
+  # source://tokenizers//lib/tokenizers/char_bpe_tokenizer.rb#3
+  def initialize(vocab, merges, unk_token: T.unsafe(nil), suffix: T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/char_bpe_tokenizer.rb#18
+  def decode(ids); end
+
+  # source://tokenizers//lib/tokenizers/char_bpe_tokenizer.rb#14
+  def encode(text, **options); end
+end
+
+# source://tokenizers//lib/tokenizers/decoders/bpe_decoder.rb#2
+module Tokenizers::Decoders; end
+
+# source://tokenizers//lib/tokenizers/decoders/bpe_decoder.rb#3
+class Tokenizers::Decoders::BPEDecoder < ::Tokenizers::Decoders::Decoder
+  def suffix; end
+  def suffix=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/decoders/bpe_decoder.rb#4
+    def new(suffix: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Decoders::ByteFallback < ::Tokenizers::Decoders::Decoder
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Decoders::ByteLevel < ::Tokenizers::Decoders::Decoder
+  class << self
+    def new; end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/decoders/ctc.rb#3
+class Tokenizers::Decoders::CTC < ::Tokenizers::Decoders::Decoder
+  def cleanup; end
+  def cleanup=(_arg0); end
+  def pad_token; end
+  def pad_token=(_arg0); end
+  def word_delimiter_token; end
+  def word_delimiter_token=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/decoders/ctc.rb#4
+    def new(pad_token: T.unsafe(nil), word_delimiter_token: T.unsafe(nil), cleanup: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Decoders::Decoder
+  def decode(_arg0); end
+end
+
+class Tokenizers::Decoders::Fuse < ::Tokenizers::Decoders::Decoder
+  class << self
+    def new; end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/decoders/metaspace.rb#3
+class Tokenizers::Decoders::Metaspace < ::Tokenizers::Decoders::Decoder
+  def prepend_scheme; end
+  def prepend_scheme=(_arg0); end
+  def replacement; end
+  def replacement=(_arg0); end
+  def split; end
+  def split=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/decoders/metaspace.rb#4
+    def new(replacement: T.unsafe(nil), prepend_scheme: T.unsafe(nil), split: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Decoders::Replace < ::Tokenizers::Decoders::Decoder
+  class << self
+    def new(_arg0, _arg1); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/decoders/strip.rb#3
+class Tokenizers::Decoders::Strip < ::Tokenizers::Decoders::Decoder
+  def content; end
+  def content=(_arg0); end
+  def start; end
+  def start=(_arg0); end
+  def stop; end
+  def stop=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/decoders/strip.rb#4
+    def new(content: T.unsafe(nil), start: T.unsafe(nil), stop: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/decoders/word_piece.rb#3
+class Tokenizers::Decoders::WordPiece < ::Tokenizers::Decoders::Decoder
+  def cleanup; end
+  def cleanup=(_arg0); end
+  def prefix; end
+  def prefix=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1); end
+
+    # source://tokenizers//lib/tokenizers/decoders/word_piece.rb#4
+    def new(prefix: T.unsafe(nil), cleanup: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/encoding.rb#2
+class Tokenizers::Encoding
+  def _char_to_token(_arg0, _arg1); end
+  def _char_to_word(_arg0, _arg1); end
+  def _word_to_chars(_arg0, _arg1); end
+  def _word_to_tokens(_arg0, _arg1); end
+  def attention_mask; end
+
+  # source://tokenizers//lib/tokenizers/encoding.rb#11
+  def char_to_token(char_pos, sequence_index = T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/encoding.rb#15
+  def char_to_word(char_pos, sequence_index = T.unsafe(nil)); end
+
+  def ids; end
+  def n_sequences; end
+  def offsets; end
+  def overflowing; end
+  def sequence_ids; end
+  def special_tokens_mask; end
+  def token_to_chars(_arg0); end
+  def token_to_sequence(_arg0); end
+  def token_to_word(_arg0); end
+  def tokens; end
+  def type_ids; end
+  def word_ids; end
+
+  # source://tokenizers//lib/tokenizers/encoding.rb#7
+  def word_to_chars(word_index, sequence_index = T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/encoding.rb#3
+  def word_to_tokens(word_index, sequence_index = T.unsafe(nil)); end
+end
+
+# source://tokenizers//lib/tokenizers.rb#53
+class Tokenizers::Error < ::StandardError; end
+
+# source://tokenizers//lib/tokenizers/from_pretrained.rb#2
+module Tokenizers::FromPretrained
+  # use Ruby for downloads
+  # this avoids the need to vendor OpenSSL on Linux
+  # and reduces the extension size by about half
+  #
+  # source://tokenizers//lib/tokenizers/from_pretrained.rb#9
+  def from_pretrained(identifier, revision: T.unsafe(nil), auth_token: T.unsafe(nil)); end
+
+  private
+
+  # source://tokenizers//lib/tokenizers/from_pretrained.rb#96
+  def cache_dir; end
+
+  # use same storage format as Rust version
+  # https://github.com/epwalsh/rust-cached-path
+  #
+  # source://tokenizers//lib/tokenizers/from_pretrained.rb#46
+  def cached_path(cache_dir, url, headers, options); end
+
+  # source://tokenizers//lib/tokenizers/from_pretrained.rb#115
+  def ensure_cache_dir; end
+
+  # @return [Boolean]
+  #
+  # source://tokenizers//lib/tokenizers/from_pretrained.rb#121
+  def mac?; end
+end
+
+# for user agent
+#
+# source://tokenizers//lib/tokenizers/from_pretrained.rb#4
+Tokenizers::FromPretrained::TOKENIZERS_VERSION = T.let(T.unsafe(nil), String)
+
+# source://tokenizers//lib/tokenizers/models/bpe.rb#2
+module Tokenizers::Models; end
+
+# source://tokenizers//lib/tokenizers/models/bpe.rb#3
+class Tokenizers::Models::BPE < ::Tokenizers::Models::Model
+  def byte_fallback; end
+  def byte_fallback=(_arg0); end
+  def continuing_subword_prefix; end
+  def continuing_subword_prefix=(_arg0); end
+  def dropout; end
+  def dropout=(_arg0); end
+  def end_of_word_suffix; end
+  def end_of_word_suffix=(_arg0); end
+  def fuse_unk; end
+  def fuse_unk=(_arg0); end
+  def unk_token; end
+  def unk_token=(_arg0); end
+
+  class << self
+    def _from_file(_arg0, _arg1, _arg2); end
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/models/bpe.rb#4
+    def new(vocab: T.unsafe(nil), merges: T.unsafe(nil), **kwargs); end
+  end
+end
+
+class Tokenizers::Models::Model; end
+
+# source://tokenizers//lib/tokenizers/models/unigram.rb#3
+class Tokenizers::Models::Unigram < ::Tokenizers::Models::Model
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/models/unigram.rb#4
+    def new(vocab: T.unsafe(nil), unk_id: T.unsafe(nil), byte_fallback: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/models/word_level.rb#3
+class Tokenizers::Models::WordLevel < ::Tokenizers::Models::Model
+  def unk_token; end
+  def unk_token=(_arg0); end
+
+  class << self
+    def _from_file(_arg0, _arg1); end
+    def _new(_arg0, _arg1); end
+
+    # source://tokenizers//lib/tokenizers/models/word_level.rb#8
+    def from_file(vocab, unk_token: T.unsafe(nil)); end
+
+    # source://tokenizers//lib/tokenizers/models/word_level.rb#4
+    def new(vocab: T.unsafe(nil), unk_token: T.unsafe(nil)); end
+
+    def read_file(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/models/word_piece.rb#3
+class Tokenizers::Models::WordPiece < ::Tokenizers::Models::Model
+  def continuing_subword_prefix; end
+  def continuing_subword_prefix=(_arg0); end
+  def max_input_chars_per_word; end
+  def max_input_chars_per_word=(_arg0); end
+  def unk_token; end
+  def unk_token=(_arg0); end
+
+  class << self
+    def _from_file(_arg0, _arg1); end
+    def _new(_arg0, _arg1); end
+
+    # source://tokenizers//lib/tokenizers/models/word_piece.rb#4
+    def new(vocab: T.unsafe(nil), **kwargs); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/normalizers/bert_normalizer.rb#2
+module Tokenizers::Normalizers; end
+
+# source://tokenizers//lib/tokenizers/normalizers/bert_normalizer.rb#3
+class Tokenizers::Normalizers::BertNormalizer < ::Tokenizers::Normalizers::Normalizer
+  def clean_text; end
+  def clean_text=(_arg0); end
+  def handle_chinese_chars; end
+  def handle_chinese_chars=(_arg0); end
+  def lowercase; end
+  def lowercase=(_arg0); end
+  def strip_accents; end
+  def strip_accents=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1, _arg2, _arg3); end
+
+    # source://tokenizers//lib/tokenizers/normalizers/bert_normalizer.rb#4
+    def new(clean_text: T.unsafe(nil), handle_chinese_chars: T.unsafe(nil), strip_accents: T.unsafe(nil), lowercase: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Normalizers::Lowercase < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::NFC < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::NFD < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::NFKC < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::NFKD < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::Nmt < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::Normalizers::Normalizer
+  def normalize_str(_arg0); end
+end
+
+class Tokenizers::Normalizers::Precompiled < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/normalizers/prepend.rb#3
+class Tokenizers::Normalizers::Prepend < ::Tokenizers::Normalizers::Normalizer
+  def prepend; end
+  def prepend=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/normalizers/prepend.rb#4
+    def new(prepend: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Normalizers::Replace < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new(_arg0, _arg1); end
+  end
+end
+
+class Tokenizers::Normalizers::Sequence < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/normalizers/strip.rb#3
+class Tokenizers::Normalizers::Strip < ::Tokenizers::Normalizers::Normalizer
+  def left; end
+  def left=(_arg0); end
+  def right; end
+  def right=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1); end
+
+    # source://tokenizers//lib/tokenizers/normalizers/strip.rb#4
+    def new(left: T.unsafe(nil), right: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Normalizers::StripAccents < ::Tokenizers::Normalizers::Normalizer
+  class << self
+    def new; end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/byte_level.rb#2
+module Tokenizers::PreTokenizers; end
+
+class Tokenizers::PreTokenizers::BertPreTokenizer < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def new; end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/byte_level.rb#3
+class Tokenizers::PreTokenizers::ByteLevel < ::Tokenizers::PreTokenizers::PreTokenizer
+  def add_prefix_space; end
+  def add_prefix_space=(_arg0); end
+  def use_regex; end
+  def use_regex=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1); end
+    def alphabet; end
+
+    # source://tokenizers//lib/tokenizers/pre_tokenizers/byte_level.rb#4
+    def new(add_prefix_space: T.unsafe(nil), use_regex: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::PreTokenizers::CharDelimiterSplit < ::Tokenizers::PreTokenizers::PreTokenizer
+  def delimiter; end
+  def delimiter=(_arg0); end
+
+  class << self
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/digits.rb#3
+class Tokenizers::PreTokenizers::Digits < ::Tokenizers::PreTokenizers::PreTokenizer
+  def individual_digits; end
+  def individual_digits=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/pre_tokenizers/digits.rb#4
+    def new(individual_digits: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/metaspace.rb#3
+class Tokenizers::PreTokenizers::Metaspace < ::Tokenizers::PreTokenizers::PreTokenizer
+  def prepend_scheme; end
+  def prepend_scheme=(_arg0); end
+  def replacement; end
+  def replacement=(_arg0); end
+  def split; end
+  def split=(_arg0); end
+
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/pre_tokenizers/metaspace.rb#4
+    def new(replacement: T.unsafe(nil), prepend_scheme: T.unsafe(nil), split: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::PreTokenizers::PreTokenizer
+  def pre_tokenize_str(_arg0); end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/punctuation.rb#3
+class Tokenizers::PreTokenizers::Punctuation < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/pre_tokenizers/punctuation.rb#4
+    def new(behavior: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::PreTokenizers::Sequence < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/pre_tokenizers/split.rb#3
+class Tokenizers::PreTokenizers::Split < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/pre_tokenizers/split.rb#4
+    def new(pattern, behavior, invert: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::PreTokenizers::UnicodeScripts < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::PreTokenizers::Whitespace < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def new; end
+  end
+end
+
+class Tokenizers::PreTokenizers::WhitespaceSplit < ::Tokenizers::PreTokenizers::PreTokenizer
+  class << self
+    def new; end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/processors/byte_level.rb#2
+module Tokenizers::Processors; end
+
+class Tokenizers::Processors::BertProcessing < ::Tokenizers::Processors::PostProcessor
+  class << self
+    def new(_arg0, _arg1); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/processors/byte_level.rb#3
+class Tokenizers::Processors::ByteLevel < ::Tokenizers::Processors::PostProcessor
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/processors/byte_level.rb#4
+    def new(trim_offsets: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Processors::PostProcessor; end
+
+# source://tokenizers//lib/tokenizers/processors/roberta_processing.rb#3
+class Tokenizers::Processors::RobertaProcessing < ::Tokenizers::Processors::PostProcessor
+  class << self
+    def _new(_arg0, _arg1, _arg2, _arg3); end
+
+    # source://tokenizers//lib/tokenizers/processors/roberta_processing.rb#4
+    def new(sep, cls, trim_offsets: T.unsafe(nil), add_prefix_space: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/processors/template_processing.rb#3
+class Tokenizers::Processors::TemplateProcessing < ::Tokenizers::Processors::PostProcessor
+  class << self
+    def _new(_arg0, _arg1, _arg2); end
+
+    # source://tokenizers//lib/tokenizers/processors/template_processing.rb#4
+    def new(single: T.unsafe(nil), pair: T.unsafe(nil), special_tokens: T.unsafe(nil)); end
+  end
+end
+
+class Tokenizers::Regex
+  class << self
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/tokenizer.rb#2
+class Tokenizers::Tokenizer
+  extend ::Tokenizers::FromPretrained
+
+  def _decode(_arg0, _arg1); end
+  def _decode_batch(_arg0, _arg1); end
+  def _enable_padding(_arg0); end
+  def _enable_truncation(_arg0, _arg1); end
+  def _encode(_arg0, _arg1, _arg2, _arg3); end
+  def _encode_batch(_arg0, _arg1, _arg2); end
+  def _save(_arg0, _arg1); end
+  def _to_s(_arg0); end
+  def _vocab(_arg0); end
+  def _vocab_size(_arg0); end
+  def add_special_tokens(_arg0); end
+  def add_tokens(_arg0); end
+  def added_tokens_decoder; end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#21
+  def decode(ids, skip_special_tokens: T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#25
+  def decode_batch(sequences, skip_special_tokens: T.unsafe(nil)); end
+
+  def decoder; end
+  def decoder=(_arg0); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#29
+  def enable_padding(**options); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#33
+  def enable_truncation(max_length, **options); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#13
+  def encode(sequence, pair = T.unsafe(nil), is_pretokenized: T.unsafe(nil), add_special_tokens: T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#17
+  def encode_batch(input, is_pretokenized: T.unsafe(nil), add_special_tokens: T.unsafe(nil)); end
+
+  def id_to_token(_arg0); end
+  def model; end
+  def model=(_arg0); end
+  def no_padding; end
+  def no_truncation; end
+  def normalizer; end
+  def normalizer=(_arg0); end
+  def num_special_tokens_to_add(_arg0); end
+  def padding; end
+  def post_processor; end
+  def post_processor=(_arg0); end
+  def pre_tokenizer; end
+  def pre_tokenizer=(_arg0); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#9
+  def save(path, pretty: T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#5
+  def to_s(pretty: T.unsafe(nil)); end
+
+  def token_to_id(_arg0); end
+  def train(_arg0, _arg1); end
+  def truncation; end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#37
+  def vocab(with_added_tokens: T.unsafe(nil)); end
+
+  # source://tokenizers//lib/tokenizers/tokenizer.rb#41
+  def vocab_size(with_added_tokens: T.unsafe(nil)); end
+
+  class << self
+    def from_file(_arg0); end
+    def from_str(_arg0); end
+    def new(_arg0); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/trainers/bpe_trainer.rb#2
+module Tokenizers::Trainers; end
+
+# source://tokenizers//lib/tokenizers/trainers/bpe_trainer.rb#3
+class Tokenizers::Trainers::BpeTrainer < ::Tokenizers::Trainers::Trainer
+  def continuing_subword_prefix; end
+  def continuing_subword_prefix=(_arg0); end
+  def end_of_word_suffix; end
+  def end_of_word_suffix=(_arg0); end
+  def initial_alphabet; end
+  def initial_alphabet=(_arg0); end
+  def limit_alphabet; end
+  def limit_alphabet=(_arg0); end
+  def min_frequency; end
+  def min_frequency=(_arg0); end
+  def show_progress; end
+  def show_progress=(_arg0); end
+  def special_tokens; end
+  def special_tokens=(_arg0); end
+  def vocab_size; end
+  def vocab_size=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/trainers/bpe_trainer.rb#4
+    def new(**options); end
+  end
+end
+
+class Tokenizers::Trainers::Trainer; end
+
+# source://tokenizers//lib/tokenizers/trainers/unigram_trainer.rb#3
+class Tokenizers::Trainers::UnigramTrainer < ::Tokenizers::Trainers::Trainer
+  def initial_alphabet; end
+  def initial_alphabet=(_arg0); end
+  def show_progress; end
+  def show_progress=(_arg0); end
+  def special_tokens; end
+  def special_tokens=(_arg0); end
+  def vocab_size; end
+  def vocab_size=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/trainers/unigram_trainer.rb#4
+    def new(vocab_size: T.unsafe(nil), show_progress: T.unsafe(nil), special_tokens: T.unsafe(nil), initial_alphabet: T.unsafe(nil), shrinking_factor: T.unsafe(nil), unk_token: T.unsafe(nil), max_piece_length: T.unsafe(nil), n_sub_iterations: T.unsafe(nil)); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/trainers/word_level_trainer.rb#3
+class Tokenizers::Trainers::WordLevelTrainer < ::Tokenizers::Trainers::Trainer
+  def min_frequency; end
+  def min_frequency=(_arg0); end
+  def show_progress; end
+  def show_progress=(_arg0); end
+  def special_tokens; end
+  def special_tokens=(_arg0); end
+  def vocab_size; end
+  def vocab_size=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/trainers/word_level_trainer.rb#4
+    def new(**options); end
+  end
+end
+
+# source://tokenizers//lib/tokenizers/trainers/word_piece_trainer.rb#3
+class Tokenizers::Trainers::WordPieceTrainer < ::Tokenizers::Trainers::Trainer
+  def continuing_subword_prefix; end
+  def continuing_subword_prefix=(_arg0); end
+  def end_of_word_suffix; end
+  def end_of_word_suffix=(_arg0); end
+  def initial_alphabet; end
+  def initial_alphabet=(_arg0); end
+  def limit_alphabet; end
+  def limit_alphabet=(_arg0); end
+  def min_frequency; end
+  def min_frequency=(_arg0); end
+  def show_progress; end
+  def show_progress=(_arg0); end
+  def special_tokens; end
+  def special_tokens=(_arg0); end
+  def vocab_size; end
+  def vocab_size=(_arg0); end
+
+  class << self
+    def _new(_arg0); end
+
+    # source://tokenizers//lib/tokenizers/trainers/word_piece_trainer.rb#4
+    def new(vocab_size: T.unsafe(nil), min_frequency: T.unsafe(nil), show_progress: T.unsafe(nil), special_tokens: T.unsafe(nil), limit_alphabet: T.unsafe(nil), initial_alphabet: T.unsafe(nil), continuing_subword_prefix: T.unsafe(nil), end_of_word_suffix: T.unsafe(nil)); end
  end
+end
+
+# source://tokenizers//lib/tokenizers/version.rb#2
+Tokenizers::VERSION = T.let(T.unsafe(nil), String)
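
For orientation, here is a minimal usage sketch of the surface this RBI covers, built only from signatures that appear above; the model identifier is an arbitrary example:

# Minimal sketch using only methods whose signatures appear in the RBI above.
require "tokenizers"

# Tokenizers::Tokenizer extends Tokenizers::FromPretrained, so a pretrained
# tokenizer can be fetched by identifier (downloaded via plain Ruby, per the
# comments in the RBI).
tokenizer = Tokenizers::Tokenizer.from_pretrained("bert-base-uncased")

# encode returns a Tokenizers::Encoding with parallel arrays of tokens and ids.
encoding = tokenizer.encode("Hello, world!")
p encoding.tokens   # subword strings
p encoding.ids      # vocabulary ids

# decode round-trips the ids back to text.
p tokenizer.decode(encoding.ids, skip_special_tokens: true)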