sentencepiece 0.0.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -2
- data/README.md +1 -2
- data/ext/sentencepiece/extconf.rb +5 -1
- data/ext/sentencepiece/sentencepiece.cpp +1 -1
- data/ext/sentencepiece/sentencepiece.hpp +2 -1
- data/lib/sentencepiece/version.rb +3 -1
- data/sig/sentencepiece.rbs +60 -1
- metadata +4 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 45cc838c2108753b87e03e3aee4bf3665f1e8c6e0cd407e5a68355b44657c582
|
|
4
|
+
data.tar.gz: 14d29d47f2a78b57218ceaf45c12a30de270f000c5d6d756b6b66a767a2ee128
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ac0b780ec734901f030bbfdcf91949401b40064d89480fa54d303d7d83f707e42e0ba897cf342ea4650402b350deac2da4c1017cac7c278adb1272e5cee0834e
|
|
7
|
+
data.tar.gz: b03e10826137a4d3d1002906af0e8e46014eafed08da76fb5494e7b5311deaf002b40b7771b7299ef1b39cb2349e1fa9f5f3262020479d2655a2ede9c3db4f03
|
data/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,13 @@
|
|
|
1
|
-
## [
|
|
1
|
+
## [[0.2.0](https://github.com/yoshoku/sentencepiece.rb/compare/v0.1.0...v0.2.0)] - 2025-12-29
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
- Change to specify `c++17` with -std option as recent sentencepiece uses `std::string_view`.
|
|
4
|
+
|
|
5
|
+
## [[0.1.0](https://github.com/yoshoku/sentencepiece.rb/compare/v0.0.2...v0.1.0)] - 2023-03-26
|
|
6
|
+
|
|
7
|
+
- Add API documentation.
|
|
8
|
+
- Add type signatures.
|
|
9
|
+
|
|
10
|
+
## [[0.0.2](https://github.com/yoshoku/sentencepiece.rb/compare/v0.0.1...v0.0.2)] - 2023-03-26
|
|
4
11
|
|
|
5
12
|
- Add SentencePieceTrainer class.
|
|
6
13
|
- Add some encoding and decoding methods to SentencePieceProcessor.
|
data/README.md
CHANGED
|
@@ -3,12 +3,11 @@
|
|
|
3
3
|
[](https://github.com/yoshoku/sentencepiece.rb/actions/workflows/main.yml)
|
|
4
4
|
[](https://badge.fury.io/rb/sentencepiece)
|
|
5
5
|
[](https://github.com/yoshoku/sentencepiece.rb/blob/main/LICENSE.txt)
|
|
6
|
+
[](https://yoshoku.github.io/sentencepiece.rb/doc/)
|
|
6
7
|
|
|
7
8
|
sentencepiece.rb provides Ruby bindings for the [SentencePiece](https://github.com/google/sentencepiece),
|
|
8
9
|
an unsupervised text tokenizer and detokenizer for neural network-based text generation.
|
|
9
10
|
|
|
10
|
-
It is still **under development** and may undergo many changes in the future.
|
|
11
|
-
|
|
12
11
|
## Installation
|
|
13
12
|
|
|
14
13
|
Install SentencePiece using your OS package manager;
|
data/ext/sentencepiece/extconf.rb
CHANGED
|
@@ -9,6 +9,10 @@ abort 'libsentencepiece_train is not found.' unless have_library('sentencepiece_
|
|
|
9
9
|
# abort 'sentencepiece_processor.h is not found.' unless have_header('sentencepiece_processor.h')
|
|
10
10
|
# abort 'sentencepiece_trainer.h is not found.' unless have_header('sentencepiece_trainer.h')
|
|
11
11
|
|
|
12
|
-
$CXXFLAGS <<
|
|
12
|
+
$CXXFLAGS << if /mswin/ =~ RUBY_PLATFORM
|
|
13
|
+
'/std:c++17'
|
|
14
|
+
else
|
|
15
|
+
' -std=c++17'
|
|
16
|
+
end
|
|
13
17
|
|
|
14
18
|
create_makefile('sentencepiece/sentencepiece')
|
data/ext/sentencepiece/sentencepiece.cpp
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* sentencepiece.rb provides Ruby bindings for the SentencePiece.
|
|
3
3
|
*
|
|
4
|
-
* Copyright (c) 2023 Atsushi Tatsuma
|
|
4
|
+
* Copyright (c) 2023-2025 Atsushi Tatsuma
|
|
5
5
|
*
|
|
6
6
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
* you may not use this file except in compliance with the License.
|
data/ext/sentencepiece/sentencepiece.hpp
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* sentencepiece.rb provides Ruby bindings for the SentencePiece.
|
|
3
3
|
*
|
|
4
|
-
* Copyright (c) 2023 Atsushi Tatsuma
|
|
4
|
+
* Copyright (c) 2023-2025 Atsushi Tatsuma
|
|
5
5
|
*
|
|
6
6
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
7
|
* you may not use this file except in compliance with the License.
|
|
@@ -79,6 +79,7 @@ public:
|
|
|
79
79
|
rb_define_method(rb_cSentencePieceProcessor, "bos_id", RUBY_METHOD_FUNC(_sentencepiece_processor_bos_id), 0);
|
|
80
80
|
rb_define_method(rb_cSentencePieceProcessor, "eos_id", RUBY_METHOD_FUNC(_sentencepiece_processor_eos_id), 0);
|
|
81
81
|
rb_define_method(rb_cSentencePieceProcessor, "pad_id", RUBY_METHOD_FUNC(_sentencepiece_processor_pad_id), 0);
|
|
82
|
+
rb_define_alias(rb_cSentencePieceProcessor, "vocab_size", "piece_size");
|
|
82
83
|
return rb_cSentencePieceProcessor;
|
|
83
84
|
};
|
|
84
85
|
|
data/sig/sentencepiece.rbs
CHANGED
|
@@ -1,4 +1,63 @@
|
|
|
1
1
|
module SentencePiece
|
|
2
2
|
VERSION: String
|
|
3
|
-
|
|
3
|
+
|
|
4
|
+
class SentencePieceTrainer
|
|
5
|
+
def self.train: (String args) -> void
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
class SentencePieceProcessor
|
|
9
|
+
def initialize: (?model_file: String model_file) -> void
|
|
10
|
+
|
|
11
|
+
def load: (String model_file) -> void
|
|
12
|
+
|
|
13
|
+
def encode: (String text, ?out_type: String out_type) -> (Array[Integer] | Array[String])
|
|
14
|
+
| (Array[String] text, ?out_type: String out_type) -> (Array[Array[Integer]] | Array[Array[String]])
|
|
15
|
+
|
|
16
|
+
def encode_as_ids: (String text) -> Array[Integer]
|
|
17
|
+
|
|
18
|
+
def encode_as_pieces: (String text) -> Array[String]
|
|
19
|
+
|
|
20
|
+
def encode_as_serialized_proto: (String text) -> String
|
|
21
|
+
|
|
22
|
+
def nbest_encode_as_ids: (String text, nbest_size: Integer nbest_size) -> Array[Array[Integer]]
|
|
23
|
+
|
|
24
|
+
def nbest_encode_as_pieces: (String text, nbest_size: Integer nbest_size) -> Array[Array[String]]
|
|
25
|
+
|
|
26
|
+
def nbest_encode_as_serialized_proto: (String text, nbest_size: Integer nbest_size) -> String
|
|
27
|
+
|
|
28
|
+
def sample_encode_as_ids: (String text, nbest_size: Integer nbest_size, alpha: Float alpha) -> Array[Integer]
|
|
29
|
+
|
|
30
|
+
def sample_encode_as_pieces: (String text, nbest_size: Integer nbest_size, alpha: Float alpha) -> Array[String]
|
|
31
|
+
|
|
32
|
+
def sample_encode_as_serialized_proto: (String text, nbest_size: Integer nbest_size, alpha: Float alpha) -> String
|
|
33
|
+
|
|
34
|
+
def decode: (Array[Integer], ?out_type: String out_type) -> String
|
|
35
|
+
| (Array[Array[Integer]], ?out_type: String out_type) -> Array[String]
|
|
36
|
+
| (Array[String], ?out_type: String out_type) -> String
|
|
37
|
+
| (Array[Array[String]], ?out_type: String out_type) -> Array[String]
|
|
38
|
+
|
|
39
|
+
def decode_ids: (Array[Integer]) -> String
|
|
40
|
+
|
|
41
|
+
def decode_ids_as_serialized_proto: (Array[Integer] ids) -> String
|
|
42
|
+
|
|
43
|
+
def decode_pieces: (Array[String]) -> String
|
|
44
|
+
|
|
45
|
+
def decode_pieces_as_serialized_proto: (Array[String] pieces) -> String
|
|
46
|
+
|
|
47
|
+
def id_to_piece: (Integer id) -> String
|
|
48
|
+
| (Array[Integer] ids) -> Array[String]
|
|
49
|
+
|
|
50
|
+
def piece_to_id: (String piece) -> Integer
|
|
51
|
+
| (Array[String] pieces) -> Array[Integer]
|
|
52
|
+
|
|
53
|
+
def piece_size: () -> Integer
|
|
54
|
+
|
|
55
|
+
def bos_id: () -> Integer
|
|
56
|
+
|
|
57
|
+
def eos_id: () -> Integer
|
|
58
|
+
|
|
59
|
+
def pad_id: () -> Integer
|
|
60
|
+
|
|
61
|
+
def unk_id: () -> Integer
|
|
62
|
+
end
|
|
4
63
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: sentencepiece
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.2
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- yoshoku
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies: []
|
|
13
12
|
description: |
|
|
14
13
|
sentencepiece.rb provides Ruby bindings for the SentencePiece,
|
|
@@ -36,8 +35,8 @@ metadata:
|
|
|
36
35
|
homepage_uri: https://github.com/yoshoku/sentencepiece.rb
|
|
37
36
|
source_code_uri: https://github.com/yoshoku/sentencepiece.rb
|
|
38
37
|
changelog_uri: https://github.com/yoshoku/sentencepiece.rb/blob/main/CHANGELOG.md
|
|
38
|
+
documentation_uri: https://yoshoku.github.io/sentencepiece.rb/doc/
|
|
39
39
|
rubygems_mfa_required: 'true'
|
|
40
|
-
post_install_message:
|
|
41
40
|
rdoc_options: []
|
|
42
41
|
require_paths:
|
|
43
42
|
- lib
|
|
@@ -52,8 +51,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
52
51
|
- !ruby/object:Gem::Version
|
|
53
52
|
version: '0'
|
|
54
53
|
requirements: []
|
|
55
|
-
rubygems_version:
|
|
56
|
-
signing_key:
|
|
54
|
+
rubygems_version: 4.0.3
|
|
57
55
|
specification_version: 4
|
|
58
56
|
summary: Ruby bindings for the SentencePiece
|
|
59
57
|
test_files: []
|