blingfire 0.1.3 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +31 -13
- data/lib/blingfire/ffi.rb +8 -0
- data/lib/blingfire/model.rb +13 -1
- data/lib/blingfire/version.rb +1 -1
- data/lib/blingfire.rb +28 -5
- data/vendor/blingfiretokdll.dll +0 -0
- data/vendor/libblingfiretokdll.arm64.dylib +0 -0
- data/vendor/libblingfiretokdll.arm64.so +0 -0
- data/vendor/libblingfiretokdll.dylib +0 -0
- data/vendor/libblingfiretokdll.so +0 -0
- metadata +9 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4c6b1fa4c1af8020140b480f8c3579bd62232b25d3cbe470a5149ecac7279a8c
|
4
|
+
data.tar.gz: ea2c66e829368d3858759edb0e0b5644e2f1abc14458d6a8a11e618aacb33951
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 714e09d5190152a1009c33000651675405a3f6402f1be056ecce74d73c1aa16c265d3c68245cb0118b5fe6e693d35f21b64b372b8f9d91ae445f063d92c22678
|
7
|
+
data.tar.gz: 3ec81c37b184e4363b4faf16d89a9869362c0655e19be451540dada56f32cba607fc7fe94dbe9210da3a864dc22dd692fc161619d2c80e6cdf0847b8f8029305
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
## 0.1.7 (2021-09-24)
|
2
|
+
|
3
|
+
- Updated Bling Fire to 0.1.8
|
4
|
+
- Added `ids_to_text` method
|
5
|
+
|
6
|
+
## 0.1.6 (2021-06-07)
|
7
|
+
|
8
|
+
- Updated Bling Fire to 0.1.7
|
9
|
+
- Added `prefix` option
|
10
|
+
|
11
|
+
## 0.1.5 (2021-03-14)
|
12
|
+
|
13
|
+
- Updated Bling Fire to 0.1.5
|
14
|
+
- Added ARM shared library for Linux
|
15
|
+
|
16
|
+
## 0.1.4 (2020-12-28)
|
17
|
+
|
18
|
+
- Added ARM shared library for Mac
|
19
|
+
|
1
20
|
## 0.1.3 (2020-10-01)
|
2
21
|
|
3
22
|
- Added `text_to_words_with_offsets` method
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[Bling Fire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://
|
5
|
+
[![Build Status](https://github.com/ankane/blingfire/workflows/build/badge.svg?branch=master)](https://github.com/ankane/blingfire/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -46,18 +46,16 @@ sentences, start_offsets, end_offsets = model.text_to_sentences_with_offsets(tex
|
|
46
46
|
|
47
47
|
## Pre-trained Models
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
- [BERT Base](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_tok.bin)
|
52
|
-
- [
|
53
|
-
- [
|
54
|
-
- [
|
55
|
-
- [
|
56
|
-
- [
|
57
|
-
- [
|
58
|
-
- [
|
59
|
-
- [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin)
|
60
|
-
- [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
|
49
|
+
Bling Fire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
|
50
|
+
|
51
|
+
- [BERT Base](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_tok.bin), [BERT Base Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_cased_tok.bin), [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin), [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
|
52
|
+
- [GPT-2](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/gpt2.bin)
|
53
|
+
- [Laser 100k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/laser100k.bin), [Laser 250k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/laser250k.bin), [Laser 500k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/laser500k.bin)
|
54
|
+
- [RoBERTa](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/roberta.bin)
|
55
|
+
- [Syllab](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/syllab.bin)
|
56
|
+
- [URI 100k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/uri100k.bin), [URI 250k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/uri250k.bin), [URI 500k](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/uri500k.bin)
|
57
|
+
- [XLM-RoBERTa](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlm_roberta_base.bin)
|
58
|
+
- [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin), [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
|
61
59
|
- [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
|
62
60
|
|
63
61
|
Load a model
|
@@ -78,6 +76,26 @@ Get offsets for ids
|
|
78
76
|
ids, start_offsets, end_offsets = model.text_to_ids_with_offsets(text)
|
79
77
|
```
|
80
78
|
|
79
|
+
Disable prefix space
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
model = BlingFire.load_model("roberta.bin", prefix: false)
|
83
|
+
```
|
84
|
+
|
85
|
+
## Ids to Text [experimental]
|
86
|
+
|
87
|
+
Load a model
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
model = BlingFire.load_model("bert_base_tok.i2w")
|
91
|
+
```
|
92
|
+
|
93
|
+
Convert ids to text
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
model.ids_to_text(ids)
|
97
|
+
```
|
98
|
+
|
81
99
|
## History
|
82
100
|
|
83
101
|
View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
|
data/lib/blingfire/ffi.rb
CHANGED
@@ -10,6 +10,8 @@ module BlingFire
|
|
10
10
|
raise e
|
11
11
|
end
|
12
12
|
|
13
|
+
typealias "bool", "char"
|
14
|
+
|
13
15
|
# https://github.com/microsoft/BlingFire/blob/master/blingfiretools/blingfiretokdll/blingfiretokdll.cpp
|
14
16
|
|
15
17
|
# version
|
@@ -40,5 +42,11 @@ module BlingFire
|
|
40
42
|
|
41
43
|
# free model
|
42
44
|
extern "int FreeModel(void* ModelPtr)"
|
45
|
+
|
46
|
+
# prefix
|
47
|
+
extern "int SetNoDummyPrefix(void* ModelPtr, bool fNoDummyPrefix)"
|
48
|
+
|
49
|
+
# ids to text
|
50
|
+
extern "int IdsToText(void* ModelPtr, int32_t * pIdsArr, int IdsCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, bool SkipSpecialTokens)"
|
43
51
|
end
|
44
52
|
end
|
data/lib/blingfire/model.rb
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
module BlingFire
|
2
2
|
class Model
|
3
|
-
def initialize(path = nil)
|
3
|
+
def initialize(path = nil, prefix: nil)
|
4
4
|
@handle = nil
|
5
5
|
if path
|
6
6
|
raise Error, "Model not found" unless File.exist?(path)
|
7
7
|
@handle = FFI.LoadModel(path)
|
8
8
|
ObjectSpace.define_finalizer(self, self.class.finalize(@handle))
|
9
|
+
|
10
|
+
BlingFire.change_settings_dummy_prefix(@handle, prefix) unless prefix.nil?
|
11
|
+
else
|
12
|
+
raise Error, "prefix option requires path" unless prefix.nil?
|
9
13
|
end
|
10
14
|
end
|
11
15
|
|
@@ -57,6 +61,14 @@ module BlingFire
|
|
57
61
|
end
|
58
62
|
end
|
59
63
|
|
64
|
+
def ids_to_text(ids, skip_special_tokens: true, output_buffer_size: nil)
|
65
|
+
if @handle
|
66
|
+
BlingFire.ids_to_text(@handle, ids, skip_special_tokens: skip_special_tokens, output_buffer_size: output_buffer_size)
|
67
|
+
else
|
68
|
+
raise "Not implemented"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
60
72
|
def to_ptr
|
61
73
|
@handle
|
62
74
|
end
|
data/lib/blingfire/version.rb
CHANGED
data/lib/blingfire.rb
CHANGED
@@ -15,9 +15,17 @@ module BlingFire
|
|
15
15
|
if Gem.win_platform?
|
16
16
|
"blingfiretokdll.dll"
|
17
17
|
elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
|
18
|
-
"libblingfiretokdll.dylib"
|
18
|
+
if RbConfig::CONFIG["host_cpu"] =~ /arm/i
|
19
|
+
"libblingfiretokdll.arm64.dylib"
|
20
|
+
else
|
21
|
+
"libblingfiretokdll.dylib"
|
22
|
+
end
|
19
23
|
else
|
20
|
-
"libblingfiretokdll.so"
|
24
|
+
if RbConfig::CONFIG["host_cpu"] =~ /aarch64/i
|
25
|
+
"libblingfiretokdll.arm64.so"
|
26
|
+
else
|
27
|
+
"libblingfiretokdll.so"
|
28
|
+
end
|
21
29
|
end
|
22
30
|
vendor_lib = File.expand_path("../vendor/#{lib_name}", __dir__)
|
23
31
|
self.ffi_lib = [vendor_lib]
|
@@ -30,8 +38,8 @@ module BlingFire
|
|
30
38
|
FFI.GetBlingFireTokVersion
|
31
39
|
end
|
32
40
|
|
33
|
-
def load_model(path)
|
34
|
-
Model.new(path)
|
41
|
+
def load_model(path, **options)
|
42
|
+
Model.new(path, **options)
|
35
43
|
end
|
36
44
|
|
37
45
|
def text_to_words(text)
|
@@ -105,6 +113,15 @@ module BlingFire
|
|
105
113
|
[result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
|
106
114
|
end
|
107
115
|
|
116
|
+
def ids_to_text(model, ids, skip_special_tokens: true, output_buffer_size: nil)
|
117
|
+
output_buffer_size ||= ids.size * 32
|
118
|
+
c_ids = Fiddle::Pointer[ids.pack("i*")]
|
119
|
+
out = Fiddle::Pointer.malloc(output_buffer_size)
|
120
|
+
out_size = FFI.IdsToText(model, c_ids, ids.size, out, output_buffer_size, skip_special_tokens ? 1 : 0)
|
121
|
+
check_status out_size, out
|
122
|
+
encode_utf8(out.to_str(out_size - 1))
|
123
|
+
end
|
124
|
+
|
108
125
|
def free_model(model)
|
109
126
|
FFI.FreeModel(model)
|
110
127
|
end
|
@@ -118,6 +135,12 @@ module BlingFire
|
|
118
135
|
encode_utf8(out.to_str(out_size))
|
119
136
|
end
|
120
137
|
|
138
|
+
def change_settings_dummy_prefix(model, value)
|
139
|
+
# use opposite of value
|
140
|
+
ret = FFI.SetNoDummyPrefix(model, value ? 0 : 1)
|
141
|
+
raise Error, "Bad status: #{ret}" if ret != 1
|
142
|
+
end
|
143
|
+
|
121
144
|
private
|
122
145
|
|
123
146
|
def check_status(ret, ptr)
|
@@ -163,7 +186,7 @@ module BlingFire
|
|
163
186
|
# TODO see if more efficient to store next_pos in variable
|
164
187
|
pos = 0
|
165
188
|
text.each_char.with_index do |c, i|
|
166
|
-
while pos == start_bytes[starts.size]
|
189
|
+
while pos == start_bytes[starts.size] || start_bytes[starts.size] == -1
|
167
190
|
starts << i
|
168
191
|
end
|
169
192
|
pos += c.bytesize
|
data/vendor/blingfiretokdll.dll
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blingfire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-09-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,7 +52,7 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '5'
|
55
|
-
description:
|
55
|
+
description:
|
56
56
|
email: andrew@chartkick.com
|
57
57
|
executables: []
|
58
58
|
extensions: []
|
@@ -67,13 +67,15 @@ files:
|
|
67
67
|
- lib/blingfire/version.rb
|
68
68
|
- vendor/LICENSE
|
69
69
|
- vendor/blingfiretokdll.dll
|
70
|
+
- vendor/libblingfiretokdll.arm64.dylib
|
71
|
+
- vendor/libblingfiretokdll.arm64.so
|
70
72
|
- vendor/libblingfiretokdll.dylib
|
71
73
|
- vendor/libblingfiretokdll.so
|
72
74
|
homepage: https://github.com/ankane/blingfire
|
73
75
|
licenses:
|
74
76
|
- MIT
|
75
77
|
metadata: {}
|
76
|
-
post_install_message:
|
78
|
+
post_install_message:
|
77
79
|
rdoc_options: []
|
78
80
|
require_paths:
|
79
81
|
- lib
|
@@ -88,8 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
90
|
- !ruby/object:Gem::Version
|
89
91
|
version: '0'
|
90
92
|
requirements: []
|
91
|
-
rubygems_version: 3.
|
92
|
-
signing_key:
|
93
|
+
rubygems_version: 3.2.22
|
94
|
+
signing_key:
|
93
95
|
specification_version: 4
|
94
96
|
summary: High speed text tokenization for Ruby
|
95
97
|
test_files: []
|