tokenizers 0.5.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +202 -126
- data/ext/tokenizers/Cargo.toml +4 -3
- data/ext/tokenizers/src/encoding.rs +10 -8
- data/ext/tokenizers/src/models.rs +37 -24
- data/ext/tokenizers/src/normalizers.rs +1 -2
- data/ext/tokenizers/src/pre_tokenizers.rs +5 -5
- data/ext/tokenizers/src/tokenizer.rs +65 -53
- data/ext/tokenizers/src/trainers.rs +60 -50
- data/ext/tokenizers/src/utils/normalization.rs +3 -2
- data/ext/tokenizers/src/utils/regex.rs +5 -4
- data/lib/tokenizers/from_pretrained.rb +2 -2
- data/lib/tokenizers/trainers/unigram_trainer.rb +10 -9
- data/lib/tokenizers/trainers/word_piece_trainer.rb +10 -9
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -4
@@ -1,13 +1,11 @@
|
|
1
|
-
use std::collections::HashSet;
|
2
1
|
use std::sync::{Arc, RwLock};
|
3
2
|
|
4
3
|
use crate::models::RbModel;
|
5
4
|
use crate::tokenizer::RbAddedToken;
|
6
5
|
use magnus::prelude::*;
|
7
6
|
use magnus::{
|
8
|
-
data_type_builder,
|
9
|
-
|
10
|
-
TryConvert, TypedData, Value,
|
7
|
+
data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error,
|
8
|
+
Module, Object, RArray, RClass, RHash, RModule, Ruby, TryConvert, TypedData, Value,
|
11
9
|
};
|
12
10
|
use serde::{Deserialize, Serialize};
|
13
11
|
use tk::models::TrainerWrapper;
|
@@ -391,10 +389,10 @@ where
|
|
391
389
|
pub struct RbBpeTrainer {}
|
392
390
|
|
393
391
|
impl RbBpeTrainer {
|
394
|
-
pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
|
392
|
+
pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
|
395
393
|
let mut builder = tk::models::bpe::BpeTrainer::builder();
|
396
394
|
|
397
|
-
let value: Value = kwargs.delete(
|
395
|
+
let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
|
398
396
|
if !value.is_nil() {
|
399
397
|
builder = builder.special_tokens(
|
400
398
|
RArray::try_convert(value)?
|
@@ -410,46 +408,50 @@ impl RbBpeTrainer {
|
|
410
408
|
);
|
411
409
|
}
|
412
410
|
|
413
|
-
let value: Value = kwargs.delete(
|
411
|
+
let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
|
414
412
|
if !value.is_nil() {
|
415
|
-
let
|
416
|
-
|
417
|
-
|
413
|
+
let alphabet = Vec::<String>::try_convert(value)?;
|
414
|
+
builder = builder.initial_alphabet(
|
415
|
+
alphabet
|
416
|
+
.into_iter()
|
417
|
+
.filter_map(|s| s.chars().next())
|
418
|
+
.collect(),
|
419
|
+
);
|
418
420
|
}
|
419
421
|
|
420
|
-
let value: Value = kwargs.delete(
|
422
|
+
let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
|
421
423
|
if !value.is_nil() {
|
422
424
|
builder = builder.vocab_size(TryConvert::try_convert(value)?);
|
423
425
|
}
|
424
426
|
|
425
|
-
let value: Value = kwargs.delete(
|
427
|
+
let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
|
426
428
|
if !value.is_nil() {
|
427
429
|
builder = builder.min_frequency(TryConvert::try_convert(value)?);
|
428
430
|
}
|
429
431
|
|
430
|
-
let value: Value = kwargs.delete(
|
432
|
+
let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
|
431
433
|
if !value.is_nil() {
|
432
434
|
builder = builder.show_progress(TryConvert::try_convert(value)?);
|
433
435
|
}
|
434
436
|
|
435
|
-
let value: Value = kwargs.delete(
|
437
|
+
let value: Value = kwargs.delete(ruby.to_symbol("limit_alphabet"))?;
|
436
438
|
if !value.is_nil() {
|
437
439
|
builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
|
438
440
|
}
|
439
441
|
|
440
|
-
let value: Value = kwargs.delete(
|
442
|
+
let value: Value = kwargs.delete(ruby.to_symbol("continuing_subword_prefix"))?;
|
441
443
|
if !value.is_nil() {
|
442
444
|
builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
|
443
445
|
}
|
444
446
|
|
445
|
-
let value: Value = kwargs.delete(
|
447
|
+
let value: Value = kwargs.delete(ruby.to_symbol("end_of_word_suffix"))?;
|
446
448
|
if !value.is_nil() {
|
447
449
|
builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
|
448
450
|
}
|
449
451
|
|
450
452
|
if !kwargs.is_empty() {
|
451
453
|
// TODO improve message
|
452
|
-
return Err(Error::new(
|
454
|
+
return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
|
453
455
|
}
|
454
456
|
|
455
457
|
Ok(builder.build().into())
|
@@ -459,10 +461,10 @@ impl RbBpeTrainer {
|
|
459
461
|
pub struct RbUnigramTrainer {}
|
460
462
|
|
461
463
|
impl RbUnigramTrainer {
|
462
|
-
pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
|
464
|
+
pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
|
463
465
|
let mut builder = tk::models::unigram::UnigramTrainer::builder();
|
464
466
|
|
465
|
-
let value: Value = kwargs.delete(
|
467
|
+
let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
|
466
468
|
if !value.is_nil() {
|
467
469
|
builder.special_tokens(
|
468
470
|
RArray::try_convert(value)?
|
@@ -478,56 +480,60 @@ impl RbUnigramTrainer {
|
|
478
480
|
);
|
479
481
|
}
|
480
482
|
|
481
|
-
let value: Value = kwargs.delete(
|
483
|
+
let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
|
482
484
|
if !value.is_nil() {
|
483
|
-
let
|
484
|
-
|
485
|
-
|
485
|
+
let alphabet = Vec::<String>::try_convert(value)?;
|
486
|
+
builder.initial_alphabet(
|
487
|
+
alphabet
|
488
|
+
.into_iter()
|
489
|
+
.filter_map(|s| s.chars().next())
|
490
|
+
.collect(),
|
491
|
+
);
|
486
492
|
}
|
487
493
|
|
488
|
-
let value: Value = kwargs.delete(
|
494
|
+
let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
|
489
495
|
if !value.is_nil() {
|
490
496
|
builder.vocab_size(TryConvert::try_convert(value)?);
|
491
497
|
}
|
492
498
|
|
493
|
-
let value: Value = kwargs.delete(
|
499
|
+
let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
|
494
500
|
if !value.is_nil() {
|
495
501
|
builder.show_progress(TryConvert::try_convert(value)?);
|
496
502
|
}
|
497
503
|
|
498
|
-
let value: Value = kwargs.delete(
|
504
|
+
let value: Value = kwargs.delete(ruby.to_symbol("n_sub_iterations"))?;
|
499
505
|
if !value.is_nil() {
|
500
506
|
builder.n_sub_iterations(TryConvert::try_convert(value)?);
|
501
507
|
}
|
502
508
|
|
503
|
-
let value: Value = kwargs.delete(
|
509
|
+
let value: Value = kwargs.delete(ruby.to_symbol("unk_token"))?;
|
504
510
|
if !value.is_nil() {
|
505
511
|
builder.unk_token(Some(TryConvert::try_convert(value)?));
|
506
512
|
}
|
507
513
|
|
508
|
-
let value: Value = kwargs.delete(
|
514
|
+
let value: Value = kwargs.delete(ruby.to_symbol("max_piece_length"))?;
|
509
515
|
if !value.is_nil() {
|
510
516
|
builder.max_piece_length(TryConvert::try_convert(value)?);
|
511
517
|
}
|
512
518
|
|
513
|
-
let value: Value = kwargs.delete(
|
519
|
+
let value: Value = kwargs.delete(ruby.to_symbol("seed_size"))?;
|
514
520
|
if !value.is_nil() {
|
515
521
|
builder.seed_size(TryConvert::try_convert(value)?);
|
516
522
|
}
|
517
523
|
|
518
|
-
let value: Value = kwargs.delete(
|
524
|
+
let value: Value = kwargs.delete(ruby.to_symbol("shrinking_factor"))?;
|
519
525
|
if !value.is_nil() {
|
520
526
|
builder.shrinking_factor(TryConvert::try_convert(value)?);
|
521
527
|
}
|
522
528
|
|
523
529
|
if !kwargs.is_empty() {
|
524
530
|
// TODO improve message
|
525
|
-
return Err(Error::new(
|
531
|
+
return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
|
526
532
|
}
|
527
533
|
|
528
534
|
let trainer = builder
|
529
535
|
.build()
|
530
|
-
.map_err(|_| Error::new(
|
536
|
+
.map_err(|_| Error::new(ruby.exception_arg_error(), "Cannot build UnigramTrainer"))?;
|
531
537
|
Ok(trainer.into())
|
532
538
|
}
|
533
539
|
}
|
@@ -535,10 +541,10 @@ impl RbUnigramTrainer {
|
|
535
541
|
pub struct RbWordLevelTrainer {}
|
536
542
|
|
537
543
|
impl RbWordLevelTrainer {
|
538
|
-
pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
|
544
|
+
pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
|
539
545
|
let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
|
540
546
|
|
541
|
-
let value: Value = kwargs.delete(
|
547
|
+
let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
|
542
548
|
if !value.is_nil() {
|
543
549
|
builder.special_tokens(
|
544
550
|
RArray::try_convert(value)?
|
@@ -554,17 +560,17 @@ impl RbWordLevelTrainer {
|
|
554
560
|
);
|
555
561
|
}
|
556
562
|
|
557
|
-
let value: Value = kwargs.delete(
|
563
|
+
let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
|
558
564
|
if !value.is_nil() {
|
559
565
|
builder.vocab_size(TryConvert::try_convert(value)?);
|
560
566
|
}
|
561
567
|
|
562
|
-
let value: Value = kwargs.delete(
|
568
|
+
let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
|
563
569
|
if !value.is_nil() {
|
564
570
|
builder.min_frequency(TryConvert::try_convert(value)?);
|
565
571
|
}
|
566
572
|
|
567
|
-
let value: Value = kwargs.delete(
|
573
|
+
let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
|
568
574
|
if !value.is_nil() {
|
569
575
|
builder.show_progress(TryConvert::try_convert(value)?);
|
570
576
|
}
|
@@ -579,10 +585,10 @@ impl RbWordLevelTrainer {
|
|
579
585
|
pub struct RbWordPieceTrainer {}
|
580
586
|
|
581
587
|
impl RbWordPieceTrainer {
|
582
|
-
pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
|
588
|
+
pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
|
583
589
|
let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
|
584
590
|
|
585
|
-
let value: Value = kwargs.delete(
|
591
|
+
let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
|
586
592
|
if !value.is_nil() {
|
587
593
|
builder = builder.special_tokens(
|
588
594
|
RArray::try_convert(value)?
|
@@ -598,46 +604,50 @@ impl RbWordPieceTrainer {
|
|
598
604
|
);
|
599
605
|
}
|
600
606
|
|
601
|
-
let value: Value = kwargs.delete(
|
607
|
+
let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
|
602
608
|
if !value.is_nil() {
|
603
|
-
let
|
604
|
-
|
605
|
-
|
609
|
+
let alphabet = Vec::<String>::try_convert(value)?;
|
610
|
+
builder = builder.initial_alphabet(
|
611
|
+
alphabet
|
612
|
+
.into_iter()
|
613
|
+
.filter_map(|s| s.chars().next())
|
614
|
+
.collect(),
|
615
|
+
);
|
606
616
|
}
|
607
617
|
|
608
|
-
let value: Value = kwargs.delete(
|
618
|
+
let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
|
609
619
|
if !value.is_nil() {
|
610
620
|
builder = builder.vocab_size(TryConvert::try_convert(value)?);
|
611
621
|
}
|
612
622
|
|
613
|
-
let value: Value = kwargs.delete(
|
623
|
+
let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
|
614
624
|
if !value.is_nil() {
|
615
625
|
builder = builder.min_frequency(TryConvert::try_convert(value)?);
|
616
626
|
}
|
617
627
|
|
618
|
-
let value: Value = kwargs.delete(
|
628
|
+
let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
|
619
629
|
if !value.is_nil() {
|
620
630
|
builder = builder.show_progress(TryConvert::try_convert(value)?);
|
621
631
|
}
|
622
632
|
|
623
|
-
let value: Value = kwargs.delete(
|
633
|
+
let value: Value = kwargs.delete(ruby.to_symbol("limit_alphabet"))?;
|
624
634
|
if !value.is_nil() {
|
625
635
|
builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
|
626
636
|
}
|
627
637
|
|
628
|
-
let value: Value = kwargs.delete(
|
638
|
+
let value: Value = kwargs.delete(ruby.to_symbol("continuing_subword_prefix"))?;
|
629
639
|
if !value.is_nil() {
|
630
640
|
builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
|
631
641
|
}
|
632
642
|
|
633
|
-
let value: Value = kwargs.delete(
|
643
|
+
let value: Value = kwargs.delete(ruby.to_symbol("end_of_word_suffix"))?;
|
634
644
|
if !value.is_nil() {
|
635
645
|
builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
|
636
646
|
}
|
637
647
|
|
638
648
|
if !kwargs.is_empty() {
|
639
649
|
// TODO improve message
|
640
|
-
return Err(Error::new(
|
650
|
+
return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
|
641
651
|
}
|
642
652
|
|
643
653
|
Ok(builder.build().into())
|
@@ -1,7 +1,7 @@
|
|
1
1
|
use super::regex::{regex, RbRegex};
|
2
2
|
use crate::RbResult;
|
3
3
|
use magnus::prelude::*;
|
4
|
-
use magnus::{
|
4
|
+
use magnus::{Error, Ruby, TryConvert, Value};
|
5
5
|
use tk::normalizer::SplitDelimiterBehavior;
|
6
6
|
use tk::pattern::Pattern;
|
7
7
|
|
@@ -62,6 +62,7 @@ pub struct RbSplitDelimiterBehavior(pub SplitDelimiterBehavior);
|
|
62
62
|
|
63
63
|
impl TryConvert for RbSplitDelimiterBehavior {
|
64
64
|
fn try_convert(obj: Value) -> RbResult<Self> {
|
65
|
+
let ruby = Ruby::get_with(obj);
|
65
66
|
let s = String::try_convert(obj)?;
|
66
67
|
|
67
68
|
Ok(Self(match s.as_str() {
|
@@ -71,7 +72,7 @@ impl TryConvert for RbSplitDelimiterBehavior {
|
|
71
72
|
"merged_with_next" => Ok(SplitDelimiterBehavior::MergedWithNext),
|
72
73
|
"contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
|
73
74
|
_ => Err(Error::new(
|
74
|
-
|
75
|
+
ruby.exception_arg_error(),
|
75
76
|
"Wrong value for SplitDelimiterBehavior, expected one of: \
|
76
77
|
`removed, isolated, merged_with_previous, merged_with_next, contiguous`",
|
77
78
|
)),
|
@@ -1,5 +1,5 @@
|
|
1
1
|
use crate::{RbResult, TOKENIZERS};
|
2
|
-
use magnus::{
|
2
|
+
use magnus::{prelude::*, value::Lazy, Error, RClass, Ruby};
|
3
3
|
use onig::Regex;
|
4
4
|
|
5
5
|
#[magnus::wrap(class = "Tokenizers::Regex")]
|
@@ -9,10 +9,11 @@ pub struct RbRegex {
|
|
9
9
|
}
|
10
10
|
|
11
11
|
impl RbRegex {
|
12
|
-
pub fn new(s: String) -> RbResult<Self> {
|
12
|
+
pub fn new(ruby: &Ruby, s: String) -> RbResult<Self> {
|
13
13
|
Ok(Self {
|
14
|
-
inner: Regex::new(&s)
|
15
|
-
|
14
|
+
inner: Regex::new(&s).map_err(|e| {
|
15
|
+
Error::new(ruby.exception_runtime_error(), e.description().to_owned())
|
16
|
+
})?,
|
16
17
|
pattern: s,
|
17
18
|
})
|
18
19
|
}
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module FromPretrained
|
3
3
|
# for user agent
|
4
|
-
TOKENIZERS_VERSION = "0.
|
4
|
+
TOKENIZERS_VERSION = "0.22.0"
|
5
5
|
|
6
6
|
# use Ruby for downloads
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
8
8
|
# and reduces the extension size by about half
|
9
9
|
def from_pretrained(identifier, revision: "main", auth_token: nil)
|
10
|
-
require "cgi"
|
10
|
+
require "cgi/escape"
|
11
11
|
require "digest"
|
12
12
|
require "fileutils"
|
13
13
|
require "json"
|
@@ -1,15 +1,16 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module Trainers
|
3
3
|
class UnigramTrainer
|
4
|
-
def self.new(
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
4
|
+
def self.new(
|
5
|
+
vocab_size: 8000,
|
6
|
+
show_progress: true,
|
7
|
+
special_tokens: [],
|
8
|
+
initial_alphabet: [],
|
9
|
+
shrinking_factor: 0.75,
|
10
|
+
unk_token: nil,
|
11
|
+
max_piece_length: 16,
|
12
|
+
n_sub_iterations: 2
|
13
|
+
)
|
13
14
|
_new({
|
14
15
|
vocab_size: vocab_size,
|
15
16
|
show_progress: show_progress,
|
@@ -1,15 +1,16 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module Trainers
|
3
3
|
class WordPieceTrainer
|
4
|
-
def self.new(
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
4
|
+
def self.new(
|
5
|
+
vocab_size: 30000,
|
6
|
+
min_frequency: 0,
|
7
|
+
show_progress: true,
|
8
|
+
special_tokens: [],
|
9
|
+
limit_alphabet: nil,
|
10
|
+
initial_alphabet: [],
|
11
|
+
continuing_subword_prefix: "##",
|
12
|
+
end_of_word_suffix: nil
|
13
|
+
)
|
13
14
|
_new({
|
14
15
|
vocab_size: vocab_size,
|
15
16
|
min_frequency: min_frequency,
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: rb_sys
|
@@ -91,14 +91,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
91
91
|
requirements:
|
92
92
|
- - ">="
|
93
93
|
- !ruby/object:Gem::Version
|
94
|
-
version: '3.
|
94
|
+
version: '3.2'
|
95
95
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
96
|
requirements:
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: '0'
|
100
100
|
requirements: []
|
101
|
-
rubygems_version: 3.6.
|
101
|
+
rubygems_version: 3.6.9
|
102
102
|
specification_version: 4
|
103
103
|
summary: Fast state-of-the-art tokenizers for Ruby
|
104
104
|
test_files: []
|