tokenizers 0.5.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,11 @@
1
- use std::collections::HashSet;
2
1
  use std::sync::{Arc, RwLock};
3
2
 
4
3
  use crate::models::RbModel;
5
4
  use crate::tokenizer::RbAddedToken;
6
5
  use magnus::prelude::*;
7
6
  use magnus::{
8
- data_type_builder, exception, function, method, value::Lazy, Class, DataType,
9
- DataTypeFunctions, Error, Module, Object, RArray, RClass, RHash, RModule, Ruby, Symbol,
10
- TryConvert, TypedData, Value,
7
+ data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error,
8
+ Module, Object, RArray, RClass, RHash, RModule, Ruby, TryConvert, TypedData, Value,
11
9
  };
12
10
  use serde::{Deserialize, Serialize};
13
11
  use tk::models::TrainerWrapper;
@@ -391,10 +389,10 @@ where
391
389
  pub struct RbBpeTrainer {}
392
390
 
393
391
  impl RbBpeTrainer {
394
- pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
392
+ pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
395
393
  let mut builder = tk::models::bpe::BpeTrainer::builder();
396
394
 
397
- let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
395
+ let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
398
396
  if !value.is_nil() {
399
397
  builder = builder.special_tokens(
400
398
  RArray::try_convert(value)?
@@ -410,46 +408,50 @@ impl RbBpeTrainer {
410
408
  );
411
409
  }
412
410
 
413
- let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
411
+ let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
414
412
  if !value.is_nil() {
415
- let arr = <Vec<char>>::try_convert(value)?;
416
- let set: HashSet<char> = HashSet::from_iter(arr);
417
- builder = builder.initial_alphabet(set);
413
+ let alphabet = Vec::<String>::try_convert(value)?;
414
+ builder = builder.initial_alphabet(
415
+ alphabet
416
+ .into_iter()
417
+ .filter_map(|s| s.chars().next())
418
+ .collect(),
419
+ );
418
420
  }
419
421
 
420
- let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
422
+ let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
421
423
  if !value.is_nil() {
422
424
  builder = builder.vocab_size(TryConvert::try_convert(value)?);
423
425
  }
424
426
 
425
- let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
427
+ let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
426
428
  if !value.is_nil() {
427
429
  builder = builder.min_frequency(TryConvert::try_convert(value)?);
428
430
  }
429
431
 
430
- let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
432
+ let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
431
433
  if !value.is_nil() {
432
434
  builder = builder.show_progress(TryConvert::try_convert(value)?);
433
435
  }
434
436
 
435
- let value: Value = kwargs.delete(Symbol::new("limit_alphabet"))?;
437
+ let value: Value = kwargs.delete(ruby.to_symbol("limit_alphabet"))?;
436
438
  if !value.is_nil() {
437
439
  builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
438
440
  }
439
441
 
440
- let value: Value = kwargs.delete(Symbol::new("continuing_subword_prefix"))?;
442
+ let value: Value = kwargs.delete(ruby.to_symbol("continuing_subword_prefix"))?;
441
443
  if !value.is_nil() {
442
444
  builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
443
445
  }
444
446
 
445
- let value: Value = kwargs.delete(Symbol::new("end_of_word_suffix"))?;
447
+ let value: Value = kwargs.delete(ruby.to_symbol("end_of_word_suffix"))?;
446
448
  if !value.is_nil() {
447
449
  builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
448
450
  }
449
451
 
450
452
  if !kwargs.is_empty() {
451
453
  // TODO improve message
452
- return Err(Error::new(exception::arg_error(), "unknown keyword"));
454
+ return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
453
455
  }
454
456
 
455
457
  Ok(builder.build().into())
@@ -459,10 +461,10 @@ impl RbBpeTrainer {
459
461
  pub struct RbUnigramTrainer {}
460
462
 
461
463
  impl RbUnigramTrainer {
462
- pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
464
+ pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
463
465
  let mut builder = tk::models::unigram::UnigramTrainer::builder();
464
466
 
465
- let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
467
+ let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
466
468
  if !value.is_nil() {
467
469
  builder.special_tokens(
468
470
  RArray::try_convert(value)?
@@ -478,56 +480,60 @@ impl RbUnigramTrainer {
478
480
  );
479
481
  }
480
482
 
481
- let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
483
+ let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
482
484
  if !value.is_nil() {
483
- let arr = <Vec<char>>::try_convert(value)?;
484
- let set: HashSet<char> = HashSet::from_iter(arr);
485
- builder.initial_alphabet(set);
485
+ let alphabet = Vec::<String>::try_convert(value)?;
486
+ builder.initial_alphabet(
487
+ alphabet
488
+ .into_iter()
489
+ .filter_map(|s| s.chars().next())
490
+ .collect(),
491
+ );
486
492
  }
487
493
 
488
- let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
494
+ let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
489
495
  if !value.is_nil() {
490
496
  builder.vocab_size(TryConvert::try_convert(value)?);
491
497
  }
492
498
 
493
- let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
499
+ let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
494
500
  if !value.is_nil() {
495
501
  builder.show_progress(TryConvert::try_convert(value)?);
496
502
  }
497
503
 
498
- let value: Value = kwargs.delete(Symbol::new("n_sub_iterations"))?;
504
+ let value: Value = kwargs.delete(ruby.to_symbol("n_sub_iterations"))?;
499
505
  if !value.is_nil() {
500
506
  builder.n_sub_iterations(TryConvert::try_convert(value)?);
501
507
  }
502
508
 
503
- let value: Value = kwargs.delete(Symbol::new("unk_token"))?;
509
+ let value: Value = kwargs.delete(ruby.to_symbol("unk_token"))?;
504
510
  if !value.is_nil() {
505
511
  builder.unk_token(Some(TryConvert::try_convert(value)?));
506
512
  }
507
513
 
508
- let value: Value = kwargs.delete(Symbol::new("max_piece_length"))?;
514
+ let value: Value = kwargs.delete(ruby.to_symbol("max_piece_length"))?;
509
515
  if !value.is_nil() {
510
516
  builder.max_piece_length(TryConvert::try_convert(value)?);
511
517
  }
512
518
 
513
- let value: Value = kwargs.delete(Symbol::new("seed_size"))?;
519
+ let value: Value = kwargs.delete(ruby.to_symbol("seed_size"))?;
514
520
  if !value.is_nil() {
515
521
  builder.seed_size(TryConvert::try_convert(value)?);
516
522
  }
517
523
 
518
- let value: Value = kwargs.delete(Symbol::new("shrinking_factor"))?;
524
+ let value: Value = kwargs.delete(ruby.to_symbol("shrinking_factor"))?;
519
525
  if !value.is_nil() {
520
526
  builder.shrinking_factor(TryConvert::try_convert(value)?);
521
527
  }
522
528
 
523
529
  if !kwargs.is_empty() {
524
530
  // TODO improve message
525
- return Err(Error::new(exception::arg_error(), "unknown keyword"));
531
+ return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
526
532
  }
527
533
 
528
534
  let trainer = builder
529
535
  .build()
530
- .map_err(|_| Error::new(exception::arg_error(), "Cannot build UnigramTrainer"))?;
536
+ .map_err(|_| Error::new(ruby.exception_arg_error(), "Cannot build UnigramTrainer"))?;
531
537
  Ok(trainer.into())
532
538
  }
533
539
  }
@@ -535,10 +541,10 @@ impl RbUnigramTrainer {
535
541
  pub struct RbWordLevelTrainer {}
536
542
 
537
543
  impl RbWordLevelTrainer {
538
- pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
544
+ pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
539
545
  let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
540
546
 
541
- let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
547
+ let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
542
548
  if !value.is_nil() {
543
549
  builder.special_tokens(
544
550
  RArray::try_convert(value)?
@@ -554,17 +560,17 @@ impl RbWordLevelTrainer {
554
560
  );
555
561
  }
556
562
 
557
- let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
563
+ let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
558
564
  if !value.is_nil() {
559
565
  builder.vocab_size(TryConvert::try_convert(value)?);
560
566
  }
561
567
 
562
- let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
568
+ let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
563
569
  if !value.is_nil() {
564
570
  builder.min_frequency(TryConvert::try_convert(value)?);
565
571
  }
566
572
 
567
- let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
573
+ let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
568
574
  if !value.is_nil() {
569
575
  builder.show_progress(TryConvert::try_convert(value)?);
570
576
  }
@@ -579,10 +585,10 @@ impl RbWordLevelTrainer {
579
585
  pub struct RbWordPieceTrainer {}
580
586
 
581
587
  impl RbWordPieceTrainer {
582
- pub fn new(kwargs: RHash) -> RbResult<RbTrainer> {
588
+ pub fn new(ruby: &Ruby, kwargs: RHash) -> RbResult<RbTrainer> {
583
589
  let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
584
590
 
585
- let value: Value = kwargs.delete(Symbol::new("special_tokens"))?;
591
+ let value: Value = kwargs.delete(ruby.to_symbol("special_tokens"))?;
586
592
  if !value.is_nil() {
587
593
  builder = builder.special_tokens(
588
594
  RArray::try_convert(value)?
@@ -598,46 +604,50 @@ impl RbWordPieceTrainer {
598
604
  );
599
605
  }
600
606
 
601
- let value: Value = kwargs.delete(Symbol::new("initial_alphabet"))?;
607
+ let value: Value = kwargs.delete(ruby.to_symbol("initial_alphabet"))?;
602
608
  if !value.is_nil() {
603
- let arr = <Vec<char>>::try_convert(value)?;
604
- let set: HashSet<char> = HashSet::from_iter(arr);
605
- builder = builder.initial_alphabet(set);
609
+ let alphabet = Vec::<String>::try_convert(value)?;
610
+ builder = builder.initial_alphabet(
611
+ alphabet
612
+ .into_iter()
613
+ .filter_map(|s| s.chars().next())
614
+ .collect(),
615
+ );
606
616
  }
607
617
 
608
- let value: Value = kwargs.delete(Symbol::new("vocab_size"))?;
618
+ let value: Value = kwargs.delete(ruby.to_symbol("vocab_size"))?;
609
619
  if !value.is_nil() {
610
620
  builder = builder.vocab_size(TryConvert::try_convert(value)?);
611
621
  }
612
622
 
613
- let value: Value = kwargs.delete(Symbol::new("min_frequency"))?;
623
+ let value: Value = kwargs.delete(ruby.to_symbol("min_frequency"))?;
614
624
  if !value.is_nil() {
615
625
  builder = builder.min_frequency(TryConvert::try_convert(value)?);
616
626
  }
617
627
 
618
- let value: Value = kwargs.delete(Symbol::new("show_progress"))?;
628
+ let value: Value = kwargs.delete(ruby.to_symbol("show_progress"))?;
619
629
  if !value.is_nil() {
620
630
  builder = builder.show_progress(TryConvert::try_convert(value)?);
621
631
  }
622
632
 
623
- let value: Value = kwargs.delete(Symbol::new("limit_alphabet"))?;
633
+ let value: Value = kwargs.delete(ruby.to_symbol("limit_alphabet"))?;
624
634
  if !value.is_nil() {
625
635
  builder = builder.limit_alphabet(TryConvert::try_convert(value)?);
626
636
  }
627
637
 
628
- let value: Value = kwargs.delete(Symbol::new("continuing_subword_prefix"))?;
638
+ let value: Value = kwargs.delete(ruby.to_symbol("continuing_subword_prefix"))?;
629
639
  if !value.is_nil() {
630
640
  builder = builder.continuing_subword_prefix(TryConvert::try_convert(value)?);
631
641
  }
632
642
 
633
- let value: Value = kwargs.delete(Symbol::new("end_of_word_suffix"))?;
643
+ let value: Value = kwargs.delete(ruby.to_symbol("end_of_word_suffix"))?;
634
644
  if !value.is_nil() {
635
645
  builder = builder.end_of_word_suffix(TryConvert::try_convert(value)?);
636
646
  }
637
647
 
638
648
  if !kwargs.is_empty() {
639
649
  // TODO improve message
640
- return Err(Error::new(exception::arg_error(), "unknown keyword"));
650
+ return Err(Error::new(ruby.exception_arg_error(), "unknown keyword"));
641
651
  }
642
652
 
643
653
  Ok(builder.build().into())
@@ -1,7 +1,7 @@
1
1
  use super::regex::{regex, RbRegex};
2
2
  use crate::RbResult;
3
3
  use magnus::prelude::*;
4
- use magnus::{exception, Error, TryConvert, Value};
4
+ use magnus::{Error, Ruby, TryConvert, Value};
5
5
  use tk::normalizer::SplitDelimiterBehavior;
6
6
  use tk::pattern::Pattern;
7
7
 
@@ -62,6 +62,7 @@ pub struct RbSplitDelimiterBehavior(pub SplitDelimiterBehavior);
62
62
 
63
63
  impl TryConvert for RbSplitDelimiterBehavior {
64
64
  fn try_convert(obj: Value) -> RbResult<Self> {
65
+ let ruby = Ruby::get_with(obj);
65
66
  let s = String::try_convert(obj)?;
66
67
 
67
68
  Ok(Self(match s.as_str() {
@@ -71,7 +72,7 @@ impl TryConvert for RbSplitDelimiterBehavior {
71
72
  "merged_with_next" => Ok(SplitDelimiterBehavior::MergedWithNext),
72
73
  "contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
73
74
  _ => Err(Error::new(
74
- exception::arg_error(),
75
+ ruby.exception_arg_error(),
75
76
  "Wrong value for SplitDelimiterBehavior, expected one of: \
76
77
  `removed, isolated, merged_with_previous, merged_with_next, contiguous`",
77
78
  )),
@@ -1,5 +1,5 @@
1
1
  use crate::{RbResult, TOKENIZERS};
2
- use magnus::{exception, prelude::*, value::Lazy, Error, RClass, Ruby};
2
+ use magnus::{prelude::*, value::Lazy, Error, RClass, Ruby};
3
3
  use onig::Regex;
4
4
 
5
5
  #[magnus::wrap(class = "Tokenizers::Regex")]
@@ -9,10 +9,11 @@ pub struct RbRegex {
9
9
  }
10
10
 
11
11
  impl RbRegex {
12
- pub fn new(s: String) -> RbResult<Self> {
12
+ pub fn new(ruby: &Ruby, s: String) -> RbResult<Self> {
13
13
  Ok(Self {
14
- inner: Regex::new(&s)
15
- .map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
14
+ inner: Regex::new(&s).map_err(|e| {
15
+ Error::new(ruby.exception_runtime_error(), e.description().to_owned())
16
+ })?,
16
17
  pattern: s,
17
18
  })
18
19
  }
@@ -1,13 +1,13 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.21.0"
4
+ TOKENIZERS_VERSION = "0.22.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
8
8
  # and reduces the extension size by about half
9
9
  def from_pretrained(identifier, revision: "main", auth_token: nil)
10
- require "cgi"
10
+ require "cgi/escape"
11
11
  require "digest"
12
12
  require "fileutils"
13
13
  require "json"
@@ -1,15 +1,16 @@
1
1
  module Tokenizers
2
2
  module Trainers
3
3
  class UnigramTrainer
4
- def self.new(vocab_size: 8000,
5
- show_progress: true,
6
- special_tokens: [],
7
- initial_alphabet: [],
8
- shrinking_factor: 0.75,
9
- unk_token: nil,
10
- max_piece_length: 16,
11
- n_sub_iterations: 2)
12
-
4
+ def self.new(
5
+ vocab_size: 8000,
6
+ show_progress: true,
7
+ special_tokens: [],
8
+ initial_alphabet: [],
9
+ shrinking_factor: 0.75,
10
+ unk_token: nil,
11
+ max_piece_length: 16,
12
+ n_sub_iterations: 2
13
+ )
13
14
  _new({
14
15
  vocab_size: vocab_size,
15
16
  show_progress: show_progress,
@@ -1,15 +1,16 @@
1
1
  module Tokenizers
2
2
  module Trainers
3
3
  class WordPieceTrainer
4
- def self.new(vocab_size: 30000,
5
- min_frequency: 0,
6
- show_progress: true,
7
- special_tokens: [],
8
- limit_alphabet: nil,
9
- initial_alphabet: [],
10
- continuing_subword_prefix: "##",
11
- end_of_word_suffix: nil)
12
-
4
+ def self.new(
5
+ vocab_size: 30000,
6
+ min_frequency: 0,
7
+ show_progress: true,
8
+ special_tokens: [],
9
+ limit_alphabet: nil,
10
+ initial_alphabet: [],
11
+ continuing_subword_prefix: "##",
12
+ end_of_word_suffix: nil
13
+ )
13
14
  _new({
14
15
  vocab_size: vocab_size,
15
16
  min_frequency: min_frequency,
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.4"
2
+ VERSION = "0.6.0"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.4
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2024-12-29 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: rb_sys
@@ -91,14 +91,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
91
91
  requirements:
92
92
  - - ">="
93
93
  - !ruby/object:Gem::Version
94
- version: '3.1'
94
+ version: '3.2'
95
95
  required_rubygems_version: !ruby/object:Gem::Requirement
96
96
  requirements:
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
99
  version: '0'
100
100
  requirements: []
101
- rubygems_version: 3.6.2
101
+ rubygems_version: 3.6.9
102
102
  specification_version: 4
103
103
  summary: Fast state-of-the-art tokenizers for Ruby
104
104
  test_files: []