tokenizers 0.3.3 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +52 -23
- data/ext/tokenizers/Cargo.toml +4 -3
- data/ext/tokenizers/src/decoders.rs +72 -61
- data/ext/tokenizers/src/error.rs +5 -3
- data/ext/tokenizers/src/lib.rs +21 -33
- data/ext/tokenizers/src/models.rs +57 -51
- data/ext/tokenizers/src/normalizers.rs +90 -77
- data/ext/tokenizers/src/pre_tokenizers.rs +85 -73
- data/ext/tokenizers/src/processors.rs +43 -38
- data/ext/tokenizers/src/tokenizer.rs +35 -28
- data/ext/tokenizers/src/trainers.rs +82 -80
- data/ext/tokenizers/src/utils/normalization.rs +4 -3
- data/ext/tokenizers/src/utils/regex.rs +5 -3
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/unigram.rb +2 -2
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae078880dfee0d026206156174a482b7e5345aea4784bb4a3e1298c499dd0e3d
|
4
|
+
data.tar.gz: baedf2cd55c0b4332232924bc2439e8ab9f6ba6703794e376f7f34f5724717c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6292155935e06d70b9ab862d2493154ec21f3cc1ec9a7188e00517f026a3d79460f84b08c9701b6eab2b758ab27ce2a5a4fb90c517ec7a1817f5de31a0b95324
|
7
|
+
data.tar.gz: 99b04f81650ae8b12be1e82dc8989a37d9d90542cb461c7fadf0e618f8ac4592b614fa357a462d2e71cb8833e058678ff6e5e5d421b825c969559f5569c89cd5
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -11,6 +11,15 @@ dependencies = [
|
|
11
11
|
"memchr",
|
12
12
|
]
|
13
13
|
|
14
|
+
[[package]]
|
15
|
+
name = "aho-corasick"
|
16
|
+
version = "1.0.5"
|
17
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
18
|
+
checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
|
19
|
+
dependencies = [
|
20
|
+
"memchr",
|
21
|
+
]
|
22
|
+
|
14
23
|
[[package]]
|
15
24
|
name = "autocfg"
|
16
25
|
version = "1.1.0"
|
@@ -25,9 +34,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
34
|
|
26
35
|
[[package]]
|
27
36
|
name = "bindgen"
|
28
|
-
version = "0.
|
37
|
+
version = "0.62.0"
|
29
38
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
39
|
+
checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
|
31
40
|
dependencies = [
|
32
41
|
"bitflags",
|
33
42
|
"cexpr",
|
@@ -40,6 +49,7 @@ dependencies = [
|
|
40
49
|
"regex",
|
41
50
|
"rustc-hash",
|
42
51
|
"shlex",
|
52
|
+
"syn 1.0.109",
|
43
53
|
]
|
44
54
|
|
45
55
|
[[package]]
|
@@ -352,31 +362,32 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
352
362
|
|
353
363
|
[[package]]
|
354
364
|
name = "magnus"
|
355
|
-
version = "0.
|
365
|
+
version = "0.6.0"
|
356
366
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
357
|
-
checksum = "
|
367
|
+
checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
|
358
368
|
dependencies = [
|
359
369
|
"magnus-macros",
|
360
370
|
"rb-sys",
|
361
371
|
"rb-sys-env",
|
372
|
+
"seq-macro",
|
362
373
|
]
|
363
374
|
|
364
375
|
[[package]]
|
365
376
|
name = "magnus-macros"
|
366
|
-
version = "0.
|
377
|
+
version = "0.6.0"
|
367
378
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
368
|
-
checksum = "
|
379
|
+
checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
|
369
380
|
dependencies = [
|
370
381
|
"proc-macro2",
|
371
382
|
"quote",
|
372
|
-
"syn
|
383
|
+
"syn 2.0.13",
|
373
384
|
]
|
374
385
|
|
375
386
|
[[package]]
|
376
387
|
name = "memchr"
|
377
|
-
version = "2.
|
388
|
+
version = "2.6.3"
|
378
389
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
379
|
-
checksum = "
|
390
|
+
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
|
380
391
|
|
381
392
|
[[package]]
|
382
393
|
name = "memoffset"
|
@@ -575,18 +586,18 @@ dependencies = [
|
|
575
586
|
|
576
587
|
[[package]]
|
577
588
|
name = "rb-sys"
|
578
|
-
version = "0.9.
|
589
|
+
version = "0.9.79"
|
579
590
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
580
|
-
checksum = "
|
591
|
+
checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
|
581
592
|
dependencies = [
|
582
593
|
"rb-sys-build",
|
583
594
|
]
|
584
595
|
|
585
596
|
[[package]]
|
586
597
|
name = "rb-sys-build"
|
587
|
-
version = "0.9.
|
598
|
+
version = "0.9.79"
|
588
599
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
589
|
-
checksum = "
|
600
|
+
checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
|
590
601
|
dependencies = [
|
591
602
|
"bindgen",
|
592
603
|
"lazy_static",
|
@@ -605,20 +616,32 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
605
616
|
|
606
617
|
[[package]]
|
607
618
|
name = "regex"
|
608
|
-
version = "1.
|
619
|
+
version = "1.9.5"
|
620
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
621
|
+
checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
|
622
|
+
dependencies = [
|
623
|
+
"aho-corasick 1.0.5",
|
624
|
+
"memchr",
|
625
|
+
"regex-automata",
|
626
|
+
"regex-syntax",
|
627
|
+
]
|
628
|
+
|
629
|
+
[[package]]
|
630
|
+
name = "regex-automata"
|
631
|
+
version = "0.3.8"
|
609
632
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
610
|
-
checksum = "
|
633
|
+
checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
|
611
634
|
dependencies = [
|
612
|
-
"aho-corasick",
|
635
|
+
"aho-corasick 1.0.5",
|
613
636
|
"memchr",
|
614
637
|
"regex-syntax",
|
615
638
|
]
|
616
639
|
|
617
640
|
[[package]]
|
618
641
|
name = "regex-syntax"
|
619
|
-
version = "0.
|
642
|
+
version = "0.7.5"
|
620
643
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
621
|
-
checksum = "
|
644
|
+
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
|
622
645
|
|
623
646
|
[[package]]
|
624
647
|
name = "rustc-hash"
|
@@ -638,6 +661,12 @@ version = "1.1.0"
|
|
638
661
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
639
662
|
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
640
663
|
|
664
|
+
[[package]]
|
665
|
+
name = "seq-macro"
|
666
|
+
version = "0.3.5"
|
667
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
668
|
+
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
669
|
+
|
641
670
|
[[package]]
|
642
671
|
name = "serde"
|
643
672
|
version = "1.0.159"
|
@@ -749,21 +778,21 @@ dependencies = [
|
|
749
778
|
|
750
779
|
[[package]]
|
751
780
|
name = "tokenizers"
|
752
|
-
version = "0.3.3"
|
781
|
+
version = "0.4.1"
|
753
782
|
dependencies = [
|
754
783
|
"magnus",
|
755
784
|
"onig",
|
756
785
|
"serde",
|
757
|
-
"tokenizers 0.
|
786
|
+
"tokenizers 0.14.0",
|
758
787
|
]
|
759
788
|
|
760
789
|
[[package]]
|
761
790
|
name = "tokenizers"
|
762
|
-
version = "0.
|
791
|
+
version = "0.14.0"
|
763
792
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
764
|
-
checksum = "
|
793
|
+
checksum = "12b515a66453a4d68f03398054f7204fd0dde6b93d3f20ea90b08025ab49b499"
|
765
794
|
dependencies = [
|
766
|
-
"aho-corasick",
|
795
|
+
"aho-corasick 0.7.20",
|
767
796
|
"derive_builder",
|
768
797
|
"esaxx-rs",
|
769
798
|
"getrandom",
|
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,20 +1,21 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.3.3"
|
3
|
+
version = "0.4.1"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
7
|
+
rust-version = "1.62.0"
|
7
8
|
publish = false
|
8
9
|
|
9
10
|
[lib]
|
10
11
|
crate-type = ["cdylib"]
|
11
12
|
|
12
13
|
[dependencies]
|
13
|
-
magnus = "0.
|
14
|
+
magnus = "0.6"
|
14
15
|
onig = { version = "6", default-features = false }
|
15
16
|
serde = { version = "1", features = ["rc", "derive"] }
|
16
17
|
|
17
18
|
[dependencies.tokenizers]
|
18
|
-
version = "=0.
|
19
|
+
version = "=0.14.0" # also update in from_pretrained.rb
|
19
20
|
default-features = false
|
20
21
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
use std::sync::{Arc, RwLock};
|
2
2
|
|
3
|
-
use magnus::
|
3
|
+
use magnus::value::Lazy;
|
4
4
|
use magnus::{
|
5
|
-
|
6
|
-
TypedData,
|
5
|
+
data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, RClass, RModule,
|
6
|
+
Ruby, TypedData,
|
7
7
|
};
|
8
8
|
use serde::{Deserialize, Serialize};
|
9
9
|
use tk::decoders::bpe::BPEDecoder;
|
@@ -19,7 +19,7 @@ use tk::Decoder;
|
|
19
19
|
use tk::normalizers::replace::Replace;
|
20
20
|
|
21
21
|
use super::utils::*;
|
22
|
-
use super::{RbError, RbResult};
|
22
|
+
use super::{DECODERS, RbError, RbResult};
|
23
23
|
|
24
24
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
25
25
|
pub struct RbDecoder {
|
@@ -260,74 +260,85 @@ impl Decoder for RbDecoderWrapper {
|
|
260
260
|
}
|
261
261
|
|
262
262
|
unsafe impl TypedData for RbDecoder {
|
263
|
-
fn class() -> RClass {
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
})
|
263
|
+
fn class(ruby: &Ruby) -> RClass {
|
264
|
+
static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
|
265
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Decoder").unwrap();
|
266
|
+
class.undef_default_alloc_func();
|
267
|
+
class
|
268
|
+
});
|
269
|
+
ruby.get_inner(&CLASS)
|
269
270
|
}
|
270
271
|
|
271
272
|
fn data_type() -> &'static DataType {
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
273
|
+
static DATA_TYPE: DataType = data_type_builder!(RbDecoder, "Tokenizers::Decoders::Decoder").build();
|
274
|
+
&DATA_TYPE
|
275
|
+
}
|
276
|
+
|
277
|
+
fn class_for(ruby: &Ruby, value: &Self) -> RClass {
|
278
|
+
static BPE_DECODER: Lazy<RClass> = Lazy::new(|ruby| {
|
279
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("BPEDecoder").unwrap();
|
280
|
+
class.undef_default_alloc_func();
|
281
|
+
class
|
282
|
+
});
|
283
|
+
static BYTE_FALLBACK: Lazy<RClass> = Lazy::new(|ruby| {
|
284
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("ByteFallback").unwrap();
|
285
|
+
class.undef_default_alloc_func();
|
286
|
+
class
|
287
|
+
});
|
288
|
+
static BYTE_LEVEL: Lazy<RClass> = Lazy::new(|ruby| {
|
289
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("ByteLevel").unwrap();
|
290
|
+
class.undef_default_alloc_func();
|
291
|
+
class
|
292
|
+
});
|
293
|
+
static CTC: Lazy<RClass> = Lazy::new(|ruby| {
|
294
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("CTC").unwrap();
|
295
|
+
class.undef_default_alloc_func();
|
296
|
+
class
|
297
|
+
});
|
298
|
+
static FUSE: Lazy<RClass> = Lazy::new(|ruby| {
|
299
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Fuse").unwrap();
|
300
|
+
class.undef_default_alloc_func();
|
301
|
+
class
|
302
|
+
});
|
303
|
+
static METASPACE: Lazy<RClass> = Lazy::new(|ruby| {
|
304
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Metaspace").unwrap();
|
305
|
+
class.undef_default_alloc_func();
|
306
|
+
class
|
307
|
+
});
|
308
|
+
static REPLACE: Lazy<RClass> = Lazy::new(|ruby| {
|
309
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Replace").unwrap();
|
310
|
+
class.undef_default_alloc_func();
|
311
|
+
class
|
312
|
+
});
|
313
|
+
static STRIP: Lazy<RClass> = Lazy::new(|ruby| {
|
314
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("Strip").unwrap();
|
315
|
+
class.undef_default_alloc_func();
|
316
|
+
class
|
317
|
+
});
|
318
|
+
static WORD_PIECE: Lazy<RClass> = Lazy::new(|ruby| {
|
319
|
+
let class: RClass = ruby.get_inner(&DECODERS).const_get("WordPiece").unwrap();
|
320
|
+
class.undef_default_alloc_func();
|
321
|
+
class
|
322
|
+
});
|
276
323
|
match &value.decoder {
|
277
324
|
RbDecoderWrapper::Wrapped(inner) => match *inner.read().unwrap() {
|
278
|
-
DecoderWrapper::BPE(_) =>
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
DecoderWrapper::
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
}),
|
288
|
-
DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
|
289
|
-
let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
|
290
|
-
class.undef_alloc_func();
|
291
|
-
class
|
292
|
-
}),
|
293
|
-
DecoderWrapper::CTC(_) => *memoize!(RClass: {
|
294
|
-
let class: RClass = crate::decoders().const_get("CTC").unwrap();
|
295
|
-
class.undef_alloc_func();
|
296
|
-
class
|
297
|
-
}),
|
298
|
-
DecoderWrapper::Fuse(_) => *memoize!(RClass: {
|
299
|
-
let class: RClass = crate::decoders().const_get("Fuse").unwrap();
|
300
|
-
class.undef_alloc_func();
|
301
|
-
class
|
302
|
-
}),
|
303
|
-
DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
|
304
|
-
let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
|
305
|
-
class.undef_alloc_func();
|
306
|
-
class
|
307
|
-
}),
|
308
|
-
DecoderWrapper::Replace(_) => *memoize!(RClass: {
|
309
|
-
let class: RClass = crate::decoders().const_get("Replace").unwrap();
|
310
|
-
class.undef_alloc_func();
|
311
|
-
class
|
312
|
-
}),
|
313
|
-
DecoderWrapper::Strip(_) => *memoize!(RClass: {
|
314
|
-
let class: RClass = crate::decoders().const_get("Strip").unwrap();
|
315
|
-
class.undef_alloc_func();
|
316
|
-
class
|
317
|
-
}),
|
318
|
-
DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
|
319
|
-
let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
|
320
|
-
class.undef_alloc_func();
|
321
|
-
class
|
322
|
-
}),
|
325
|
+
DecoderWrapper::BPE(_) => ruby.get_inner(&BPE_DECODER),
|
326
|
+
DecoderWrapper::ByteFallback(_) => ruby.get_inner(&BYTE_FALLBACK),
|
327
|
+
DecoderWrapper::ByteLevel(_) => ruby.get_inner(&BYTE_LEVEL),
|
328
|
+
DecoderWrapper::CTC(_) => ruby.get_inner(&CTC),
|
329
|
+
DecoderWrapper::Fuse(_) => ruby.get_inner(&FUSE),
|
330
|
+
DecoderWrapper::Metaspace(_) => ruby.get_inner(&METASPACE),
|
331
|
+
DecoderWrapper::Replace(_) => ruby.get_inner(&REPLACE),
|
332
|
+
DecoderWrapper::Strip(_) => ruby.get_inner(&STRIP),
|
333
|
+
DecoderWrapper::WordPiece(_) => ruby.get_inner(&WORD_PIECE),
|
323
334
|
_ => todo!(),
|
324
335
|
},
|
325
336
|
}
|
326
337
|
}
|
327
338
|
}
|
328
339
|
|
329
|
-
pub fn
|
330
|
-
let decoder = module.define_class("Decoder",
|
340
|
+
pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
341
|
+
let decoder = module.define_class("Decoder", ruby.class_object())?;
|
331
342
|
|
332
343
|
let class = module.define_class("BPEDecoder", decoder)?;
|
333
344
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
data/ext/tokenizers/src/error.rs
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
use magnus::{
|
1
|
+
use magnus::{prelude::*, value::Lazy, Error, ExceptionClass, Ruby};
|
2
2
|
|
3
|
-
use super::
|
3
|
+
use super::TOKENIZERS;
|
4
4
|
|
5
5
|
pub struct RbError {}
|
6
6
|
|
@@ -11,6 +11,8 @@ impl RbError {
|
|
11
11
|
}
|
12
12
|
}
|
13
13
|
|
14
|
+
static ERROR: Lazy<ExceptionClass> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
|
15
|
+
|
14
16
|
fn error() -> ExceptionClass {
|
15
|
-
|
17
|
+
Ruby::get().unwrap().get_inner(&ERROR)
|
16
18
|
}
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
#![allow(clippy::new_ret_no_self)]
|
2
|
+
|
1
3
|
extern crate tokenizers as tk;
|
2
4
|
|
3
5
|
mod decoders;
|
@@ -16,43 +18,29 @@ use error::RbError;
|
|
16
18
|
use tokenizer::RbTokenizer;
|
17
19
|
use utils::RbRegex;
|
18
20
|
|
19
|
-
use magnus::{
|
21
|
+
use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
|
20
22
|
|
21
23
|
type RbResult<T> = Result<T, Error>;
|
22
24
|
|
23
|
-
|
24
|
-
*memoize!(RModule: define_module("Tokenizers").unwrap())
|
25
|
-
}
|
25
|
+
static TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
|
26
26
|
|
27
|
-
|
28
|
-
*memoize!(RModule: module().const_get("Decoders").unwrap())
|
29
|
-
}
|
27
|
+
static DECODERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
|
30
28
|
|
31
|
-
|
32
|
-
*memoize!(RModule: module().const_get("Models").unwrap())
|
33
|
-
}
|
29
|
+
static MODELS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
|
34
30
|
|
35
|
-
|
36
|
-
*memoize!(RModule: module().const_get("Normalizers").unwrap())
|
37
|
-
}
|
31
|
+
static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Normalizers").unwrap());
|
38
32
|
|
39
|
-
|
40
|
-
*memoize!(RModule: module().const_get("PreTokenizers").unwrap())
|
41
|
-
}
|
33
|
+
static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("PreTokenizers").unwrap());
|
42
34
|
|
43
|
-
|
44
|
-
*memoize!(RModule: module().const_get("Processors").unwrap())
|
45
|
-
}
|
35
|
+
static PROCESSORS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
|
46
36
|
|
47
|
-
|
48
|
-
*memoize!(RModule: module().const_get("Trainers").unwrap())
|
49
|
-
}
|
37
|
+
static TRAINERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
|
50
38
|
|
51
39
|
#[magnus::init]
|
52
|
-
fn init() -> RbResult<()> {
|
53
|
-
let module =
|
40
|
+
fn init(ruby: &Ruby) -> RbResult<()> {
|
41
|
+
let module = ruby.define_module("Tokenizers")?;
|
54
42
|
|
55
|
-
let class = module.define_class("Tokenizer",
|
43
|
+
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
56
44
|
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
57
45
|
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
58
46
|
class.define_method(
|
@@ -86,7 +74,7 @@ fn init() -> RbResult<()> {
|
|
86
74
|
class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
|
87
75
|
class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
|
88
76
|
|
89
|
-
let class = module.define_class("Encoding",
|
77
|
+
let class = module.define_class("Encoding", ruby.class_object())?;
|
90
78
|
class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
|
91
79
|
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
92
80
|
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
@@ -111,7 +99,7 @@ fn init() -> RbResult<()> {
|
|
111
99
|
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
112
100
|
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
113
101
|
|
114
|
-
let class = module.define_class("Regex",
|
102
|
+
let class = module.define_class("Regex", ruby.class_object())?;
|
115
103
|
class.define_singleton_method("new", function!(RbRegex::new, 1))?;
|
116
104
|
|
117
105
|
let models = module.define_module("Models")?;
|
@@ -121,12 +109,12 @@ fn init() -> RbResult<()> {
|
|
121
109
|
let normalizers = module.define_module("Normalizers")?;
|
122
110
|
let trainers = module.define_module("Trainers")?;
|
123
111
|
|
124
|
-
models::
|
125
|
-
pre_tokenizers::
|
126
|
-
decoders::
|
127
|
-
processors::
|
128
|
-
normalizers::
|
129
|
-
trainers::
|
112
|
+
models::init_models(ruby, &models)?;
|
113
|
+
pre_tokenizers::init_pre_tokenizers(ruby, &pre_tokenizers)?;
|
114
|
+
decoders::init_decoders(ruby, &decoders)?;
|
115
|
+
processors::init_processors(ruby, &processors)?;
|
116
|
+
normalizers::init_normalizers(ruby, &normalizers)?;
|
117
|
+
trainers::init_trainers(ruby, &trainers)?;
|
130
118
|
|
131
119
|
Ok(())
|
132
120
|
}
|