tokenizers 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +52 -23
- data/ext/tokenizers/Cargo.toml +4 -3
- data/ext/tokenizers/src/decoders.rs +72 -61
- data/ext/tokenizers/src/error.rs +5 -3
- data/ext/tokenizers/src/lib.rs +21 -33
- data/ext/tokenizers/src/models.rs +57 -51
- data/ext/tokenizers/src/normalizers.rs +90 -77
- data/ext/tokenizers/src/pre_tokenizers.rs +85 -73
- data/ext/tokenizers/src/processors.rs +43 -38
- data/ext/tokenizers/src/tokenizer.rs +35 -28
- data/ext/tokenizers/src/trainers.rs +82 -80
- data/ext/tokenizers/src/utils/normalization.rs +4 -3
- data/ext/tokenizers/src/utils/regex.rs +5 -3
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/unigram.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0677a3662d1f3af9007bdc0524a71903270cfbb85c3bf0efaa2365946df6c5b1
|
4
|
+
data.tar.gz: 00e49737180957c88fb8f8418de40c8983715c733275df16c1ab2fe97355f92d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1fd8f9a57be7cac938ae6bc46533f540b210ef6d46a3f46e168a257d2b3156000b9f451e1085a9c90dd54444abeaa69b5d2ae2905fdb5abfc85fb7d610f1427
|
7
|
+
data.tar.gz: c748fb9150431ecce025e9f98f870e8140f186a2cc796eb5df2dfef630db05ab45eb8f23f2dc5e50d39b4eadb5fc1f5e4902482f41b6dd469ff95eae4ec3d2f3
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -11,6 +11,15 @@ dependencies = [
|
|
11
11
|
"memchr",
|
12
12
|
]
|
13
13
|
|
14
|
+
[[package]]
|
15
|
+
name = "aho-corasick"
|
16
|
+
version = "1.0.5"
|
17
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
18
|
+
checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
|
19
|
+
dependencies = [
|
20
|
+
"memchr",
|
21
|
+
]
|
22
|
+
|
14
23
|
[[package]]
|
15
24
|
name = "autocfg"
|
16
25
|
version = "1.1.0"
|
@@ -25,9 +34,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
34
|
|
26
35
|
[[package]]
|
27
36
|
name = "bindgen"
|
28
|
-
version = "0.
|
37
|
+
version = "0.62.0"
|
29
38
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
39
|
+
checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
|
31
40
|
dependencies = [
|
32
41
|
"bitflags",
|
33
42
|
"cexpr",
|
@@ -40,6 +49,7 @@ dependencies = [
|
|
40
49
|
"regex",
|
41
50
|
"rustc-hash",
|
42
51
|
"shlex",
|
52
|
+
"syn 1.0.109",
|
43
53
|
]
|
44
54
|
|
45
55
|
[[package]]
|
@@ -352,31 +362,32 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
352
362
|
|
353
363
|
[[package]]
|
354
364
|
name = "magnus"
|
355
|
-
version = "0.
|
365
|
+
version = "0.6.0"
|
356
366
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
357
|
-
checksum = "
|
367
|
+
checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
|
358
368
|
dependencies = [
|
359
369
|
"magnus-macros",
|
360
370
|
"rb-sys",
|
361
371
|
"rb-sys-env",
|
372
|
+
"seq-macro",
|
362
373
|
]
|
363
374
|
|
364
375
|
[[package]]
|
365
376
|
name = "magnus-macros"
|
366
|
-
version = "0.
|
377
|
+
version = "0.6.0"
|
367
378
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
368
|
-
checksum = "
|
379
|
+
checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
369
380
|
dependencies = [
|
370
381
|
"proc-macro2",
|
371
382
|
"quote",
|
372
|
-
"syn
|
383
|
+
"syn 2.0.13",
|
373
384
|
]
|
374
385
|
|
375
386
|
[[package]]
|
376
387
|
name = "memchr"
|
377
|
-
version = "2.
|
388
|
+
version = "2.6.3"
|
378
389
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
379
|
-
checksum = "
|
390
|
+
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
|
380
391
|
|
381
392
|
[[package]]
|
382
393
|
name = "memoffset"
|
@@ -575,18 +586,18 @@ dependencies = [
|
|
575
586
|
|
576
587
|
[[package]]
|
577
588
|
name = "rb-sys"
|
578
|
-
version = "0.9.
|
589
|
+
version = "0.9.79"
|
579
590
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
580
|
-
checksum = "
|
591
|
+
checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
|
581
592
|
dependencies = [
|
582
593
|
"rb-sys-build",
|
583
594
|
]
|
584
595
|
|
585
596
|
[[package]]
|
586
597
|
name = "rb-sys-build"
|
587
|
-
version = "0.9.
|
598
|
+
version = "0.9.79"
|
588
599
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
589
|
-
checksum = "
|
600
|
+
checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
|
590
601
|
dependencies = [
|
591
602
|
"bindgen",
|
592
603
|
"lazy_static",
|
@@ -605,20 +616,32 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
605
616
|
|
606
617
|
[[package]]
|
607
618
|
name = "regex"
|
608
|
-
version = "1.
|
619
|
+
version = "1.9.5"
|
609
620
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
610
|
-
checksum = "
|
621
|
+
checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
|
611
622
|
dependencies = [
|
612
|
-
"aho-corasick",
|
623
|
+
"aho-corasick 1.0.5",
|
624
|
+
"memchr",
|
625
|
+
"regex-automata",
|
626
|
+
"regex-syntax",
|
627
|
+
]
|
628
|
+
|
629
|
+
[[package]]
|
630
|
+
name = "regex-automata"
|
631
|
+
version = "0.3.8"
|
632
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
633
|
+
checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
634
|
+
dependencies = [
|
635
|
+
"aho-corasick 1.0.5",
|
613
636
|
"memchr",
|
614
637
|
"regex-syntax",
|
615
638
|
]
|
616
639
|
|
617
640
|
[[package]]
|
618
641
|
name = "regex-syntax"
|
619
|
-
version = "0.
|
642
|
+
version = "0.7.5"
|
620
643
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
621
|
-
checksum = "
|
644
|
+
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
622
645
|
|
623
646
|
[[package]]
|
624
647
|
name = "rustc-hash"
|
@@ -638,6 +661,12 @@ version = "1.1.0"
|
|
638
661
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
639
662
|
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
640
663
|
|
664
|
+
[[package]]
|
665
|
+
name = "seq-macro"
|
666
|
+
version = "0.3.5"
|
667
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
668
|
+
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
669
|
+
|
641
670
|
[[package]]
|
642
671
|
name = "serde"
|
643
672
|
version = "1.0.159"
|
@@ -749,21 +778,21 @@ dependencies = [
|
|
749
778
|
|
750
779
|
[[package]]
|
751
780
|
name = "tokenizers"
|
752
|
-
version = "0.
|
781
|
+
version = "0.4.0"
|
753
782
|
dependencies = [
|
754
783
|
"magnus",
|
755
784
|
"onig",
|
756
785
|
"serde",
|
757
|
-
"tokenizers 0.
|
786
|
+
"tokenizers 0.14.0",
|
758
787
|
]
|
759
788
|
|
760
789
|
[[package]]
|
761
790
|
name = "tokenizers"
|
762
|
-
version = "0.
|
791
|
+
version = "0.14.0"
|
763
792
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
-
checksum = "
|
793
|
+
checksum = "12b515a66453a4d68f03398054f7204fd0dde6b93d3f20ea90b08025ab49b499"
|
765
794
|
dependencies = [
|
766
|
-
"aho-corasick",
|
795
|
+
"aho-corasick 0.7.20",
|
767
796
|
"derive_builder",
|
768
797
|
"esaxx-rs",
|
769
798
|
"getrandom",
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,20 +1,21 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.
|
3
|
+
version = "0.4.0"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
7
|
+
rust-version = "1.62.0"
|
7
8
|
publish = false
|
8
9
|
|
9
10
|
[lib]
|
10
11
|
crate-type = ["cdylib"]
|
11
12
|
|
12
13
|
[dependencies]
|
13
|
-
magnus = "0.
|
14
|
+
magnus = "0.6"
|
14
15
|
onig = { version = "6", default-features = false }
|
15
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
16
17
|
|
17
18
|
[dependencies.tokenizers]
|
18
|
-
version = "=0.
|
19
|
+
version = "=0.14.0" # also update in from_pretrained.rb
|
19
20
|
default-features = false
|
20
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
|
-
use magnus::
|
3
|
+
use magnus::value::Lazy;
|
4
4
|
use magnus::{
|
5
|
-
|
6
|
-
TypedData,
|
5
|
+
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
6
|
+
Ruby, TypedData,
|
7
7
|
};
|
8
8
|
use serde::{Deserialize, Serialize};
|
9
9
|
use tk::decoders::bpe::BPEDecoder;
|
@@ -19,7 +19,7 @@ use tk::Decoder;
|
|
19
19
|
use tk::normalizers::replace::Replace;
|
20
20
|
|
21
21
|
use super::utils::*;
|
22
|
-
use super::{RbError, RbResult};
|
22
|
+
use super::{DECODERS, RbError, RbResult};
|
23
23
|
|
24
24
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
25
25
|
pub struct RbDecoder {
|
@@ -260,74 +260,85 @@ impl Decoder for RbDecoderWrapper {
|
|
260
260
|
}
|
261
261
|
|
262
262
|
unsafe impl TypedData for RbDecoder {
|
263
|
-
fn class() -> RClass {
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
})
|
263
|
+
fn class(ruby: &Ruby) -> RClass {
|
264
|
+
static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
|
265
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Decoder").unwrap();
|
266
|
+
class.undef_default_alloc_func();
|
267
|
+
class
|
268
|
+
});
|
269
|
+
ruby.get_inner(&CLASS)
|
269
270
|
}
|
270
271
|
|
271
272
|
fn data_type() -> &'static DataType {
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
273
|
+
static DATA_TYPE: DataType = data_type_builder!(RbDecoder, "Tokenizers::Decoders::Decoder").build();
|
274
|
+
&DATA_TYPE
|
275
|
+
}
|
276
|
+
|
277
|
+
fn class_for(ruby: &Ruby, value: &Self) -> RClass {
|
278
|
+
static BPE_DECODER: Lazy<RClass> = Lazy::new(|ruby| {
|
279
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("BPEDecoder").unwrap();
|
280
|
+
class.undef_default_alloc_func();
|
281
|
+
class
|
282
|
+
});
|
283
|
+
static BYTE_FALLBACK: Lazy<RClass> = Lazy::new(|ruby| {
|
284
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("ByteFallback").unwrap();
|
285
|
+
class.undef_default_alloc_func();
|
286
|
+
class
|
287
|
+
});
|
288
|
+
static BYTE_LEVEL: Lazy<RClass> = Lazy::new(|ruby| {
|
289
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("ByteLevel").unwrap();
|
290
|
+
class.undef_default_alloc_func();
|
291
|
+
class
|
292
|
+
});
|
293
|
+
static CTC: Lazy<RClass> = Lazy::new(|ruby| {
|
294
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("CTC").unwrap();
|
295
|
+
class.undef_default_alloc_func();
|
296
|
+
class
|
297
|
+
});
|
298
|
+
static FUSE: Lazy<RClass> = Lazy::new(|ruby| {
|
299
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Fuse").unwrap();
|
300
|
+
class.undef_default_alloc_func();
|
301
|
+
class
|
302
|
+
});
|
303
|
+
static METASPACE: Lazy<RClass> = Lazy::new(|ruby| {
|
304
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Metaspace").unwrap();
|
305
|
+
class.undef_default_alloc_func();
|
306
|
+
class
|
307
|
+
});
|
308
|
+
static REPLACE: Lazy<RClass> = Lazy::new(|ruby| {
|
309
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Replace").unwrap();
|
310
|
+
class.undef_default_alloc_func();
|
311
|
+
class
|
312
|
+
});
|
313
|
+
static STRIP: Lazy<RClass> = Lazy::new(|ruby| {
|
314
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Strip").unwrap();
|
315
|
+
class.undef_default_alloc_func();
|
316
|
+
class
|
317
|
+
});
|
318
|
+
static WORD_PIECE: Lazy<RClass> = Lazy::new(|ruby| {
|
319
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("WordPiece").unwrap();
|
320
|
+
class.undef_default_alloc_func();
|
321
|
+
class
|
322
|
+
});
|
276
323
|
match &value.decoder {
|
277
324
|
RbDecoderWrapper::Wrapped(inner) => match *inner.read().unwrap() {
|
278
|
-
DecoderWrapper::BPE(_) =>
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
DecoderWrapper::
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
}),
|
288
|
-
DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
|
289
|
-
let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
|
290
|
-
class.undef_alloc_func();
|
291
|
-
class
|
292
|
-
}),
|
293
|
-
DecoderWrapper::CTC(_) => *memoize!(RClass: {
|
294
|
-
let class: RClass = crate::decoders().const_get("CTC").unwrap();
|
295
|
-
class.undef_alloc_func();
|
296
|
-
class
|
297
|
-
}),
|
298
|
-
DecoderWrapper::Fuse(_) => *memoize!(RClass: {
|
299
|
-
let class: RClass = crate::decoders().const_get("Fuse").unwrap();
|
300
|
-
class.undef_alloc_func();
|
301
|
-
class
|
302
|
-
}),
|
303
|
-
DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
|
304
|
-
let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
|
305
|
-
class.undef_alloc_func();
|
306
|
-
class
|
307
|
-
}),
|
308
|
-
DecoderWrapper::Replace(_) => *memoize!(RClass: {
|
309
|
-
let class: RClass = crate::decoders().const_get("Replace").unwrap();
|
310
|
-
class.undef_alloc_func();
|
311
|
-
class
|
312
|
-
}),
|
313
|
-
DecoderWrapper::Strip(_) => *memoize!(RClass: {
|
314
|
-
let class: RClass = crate::decoders().const_get("Strip").unwrap();
|
315
|
-
class.undef_alloc_func();
|
316
|
-
class
|
317
|
-
}),
|
318
|
-
DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
|
319
|
-
let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
|
320
|
-
class.undef_alloc_func();
|
321
|
-
class
|
322
|
-
}),
|
325
|
+
DecoderWrapper::BPE(_) => ruby.get_inner(&BPE_DECODER),
|
326
|
+
DecoderWrapper::ByteFallback(_) => ruby.get_inner(&BYTE_FALLBACK),
|
327
|
+
DecoderWrapper::ByteLevel(_) => ruby.get_inner(&BYTE_LEVEL),
|
328
|
+
DecoderWrapper::CTC(_) => ruby.get_inner(&CTC),
|
329
|
+
DecoderWrapper::Fuse(_) => ruby.get_inner(&FUSE),
|
330
|
+
DecoderWrapper::Metaspace(_) => ruby.get_inner(&METASPACE),
|
331
|
+
DecoderWrapper::Replace(_) => ruby.get_inner(&REPLACE),
|
332
|
+
DecoderWrapper::Strip(_) => ruby.get_inner(&STRIP),
|
333
|
+
DecoderWrapper::WordPiece(_) => ruby.get_inner(&WORD_PIECE),
|
323
334
|
_ => todo!(),
|
324
335
|
},
|
325
336
|
}
|
326
337
|
}
|
327
338
|
}
|
328
339
|
|
329
|
-
pub fn
|
330
|
-
let decoder = module.define_class("Decoder",
|
340
|
+
pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
341
|
+
let decoder = module.define_class("Decoder", ruby.class_object())?;
|
331
342
|
|
332
343
|
let class = module.define_class("BPEDecoder", decoder)?;
|
333
344
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
data/ext/tokenizers/src/error.rs
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
use magnus::{
|
1
|
+
use magnus::{prelude::*, value::Lazy, Error, ExceptionClass, Ruby};
|
2
2
|
|
3
|
-
use super::
|
3
|
+
use super::TOKENIZERS;
|
4
4
|
|
5
5
|
pub struct RbError {}
|
6
6
|
|
@@ -11,6 +11,8 @@ impl RbError {
|
|
11
11
|
}
|
12
12
|
}
|
13
13
|
|
14
|
+
static ERROR: Lazy<ExceptionClass> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
|
15
|
+
|
14
16
|
fn error() -> ExceptionClass {
|
15
|
-
|
17
|
+
Ruby::get().unwrap().get_inner(&ERROR)
|
16
18
|
}
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
#![allow(clippy::new_ret_no_self)]
|
2
|
+
|
1
3
|
extern crate tokenizers as tk;
|
2
4
|
|
3
5
|
mod decoders;
|
@@ -16,43 +18,29 @@ use error::RbError;
|
|
16
18
|
use tokenizer::RbTokenizer;
|
17
19
|
use utils::RbRegex;
|
18
20
|
|
19
|
-
use magnus::{
|
21
|
+
use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
|
20
22
|
|
21
23
|
type RbResult<T> = Result<T, Error>;
|
22
24
|
|
23
|
-
|
24
|
-
*memoize!(RModule: define_module("Tokenizers").unwrap())
|
25
|
-
}
|
25
|
+
static TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
|
26
26
|
|
27
|
-
|
28
|
-
*memoize!(RModule: module().const_get("Decoders").unwrap())
|
29
|
-
}
|
27
|
+
static DECODERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
|
30
28
|
|
31
|
-
|
32
|
-
*memoize!(RModule: module().const_get("Models").unwrap())
|
33
|
-
}
|
29
|
+
static MODELS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
|
34
30
|
|
35
|
-
|
36
|
-
*memoize!(RModule: module().const_get("Normalizers").unwrap())
|
37
|
-
}
|
31
|
+
static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Normalizers").unwrap());
|
38
32
|
|
39
|
-
|
40
|
-
*memoize!(RModule: module().const_get("PreTokenizers").unwrap())
|
41
|
-
}
|
33
|
+
static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("PreTokenizers").unwrap());
|
42
34
|
|
43
|
-
|
44
|
-
*memoize!(RModule: module().const_get("Processors").unwrap())
|
45
|
-
}
|
35
|
+
static PROCESSORS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
|
46
36
|
|
47
|
-
|
48
|
-
*memoize!(RModule: module().const_get("Trainers").unwrap())
|
49
|
-
}
|
37
|
+
static TRAINERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
|
50
38
|
|
51
39
|
#[magnus::init]
|
52
|
-
fn init() -> RbResult<()> {
|
53
|
-
let module =
|
40
|
+
fn init(ruby: &Ruby) -> RbResult<()> {
|
41
|
+
let module = ruby.get_inner(&TOKENIZERS);
|
54
42
|
|
55
|
-
let class = module.define_class("Tokenizer",
|
43
|
+
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
56
44
|
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
57
45
|
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
58
46
|
class.define_method(
|
@@ -86,7 +74,7 @@ fn init() -> RbResult<()> {
|
|
86
74
|
class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
|
87
75
|
class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
|
88
76
|
|
89
|
-
let class = module.define_class("Encoding",
|
77
|
+
let class = module.define_class("Encoding", ruby.class_object())?;
|
90
78
|
class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
|
91
79
|
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
92
80
|
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
@@ -111,7 +99,7 @@ fn init() -> RbResult<()> {
|
|
111
99
|
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
112
100
|
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
113
101
|
|
114
|
-
let class = module.define_class("Regex",
|
102
|
+
let class = module.define_class("Regex", ruby.class_object())?;
|
115
103
|
class.define_singleton_method("new", function!(RbRegex::new, 1))?;
|
116
104
|
|
117
105
|
let models = module.define_module("Models")?;
|
@@ -121,12 +109,12 @@ fn init() -> RbResult<()> {
|
|
121
109
|
let normalizers = module.define_module("Normalizers")?;
|
122
110
|
let trainers = module.define_module("Trainers")?;
|
123
111
|
|
124
|
-
models::
|
125
|
-
pre_tokenizers::
|
126
|
-
decoders::
|
127
|
-
processors::
|
128
|
-
normalizers::
|
129
|
-
trainers::
|
112
|
+
models::init_models(ruby, &models)?;
|
113
|
+
pre_tokenizers::init_pre_tokenizers(ruby, &pre_tokenizers)?;
|
114
|
+
decoders::init_decoders(ruby, &decoders)?;
|
115
|
+
processors::init_processors(ruby, &processors)?;
|
116
|
+
normalizers::init_normalizers(ruby, &normalizers)?;
|
117
|
+
trainers::init_trainers(ruby, &trainers)?;
|
130
118
|
|
131
119
|
Ok(())
|
132
120
|
}
|