blingfire 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +6 -2
- data/lib/blingfire.rb +24 -23
- data/lib/blingfire/model.rb +1 -0
- data/lib/blingfire/version.rb +1 -1
- data/vendor/blingfiretokdll.dll +0 -0
- data/vendor/libblingfiretokdll.dylib +0 -0
- data/vendor/libblingfiretokdll.so +0 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 225c7163bbbd6bc06e0b7d54b2a0a64c3b622c174e5c6cb0b8fa54b6e68740a3
|
4
|
+
data.tar.gz: b9d8ce2d21a450c53b1628bdab97b07aa2c39dd1b758fa763c39f052a944084c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98eed6c71ee974381772e032b27d022cd9c9485a83e8dae25c21817a19489953c7c5dfd4bb4498d796b65a569bd3468f55df558603d904ed9293a4c21e7041d5
|
7
|
+
data.tar.gz: 15389737eec2ad109f528bf7cd064184205bc17d822d108031f29eb7cf9d421c0a84f40a3d170461e7fa3c853b1fbc912b28f07e403924f9744cb8f6c2294f2d
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
#
|
1
|
+
# Bling Fire
|
2
2
|
|
3
|
-
[
|
3
|
+
[Bling Fire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
|
4
|
+
|
5
|
+
[Travis CI build status](https://travis-ci.org/ankane/blingfire) [AppVeyor build status](https://ci.appveyor.com/project/ankane/blingfire/branch/master)
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
|
@@ -39,6 +41,8 @@ BlingFire comes with a default model that follows the tokenization logic of NLTK
|
|
39
41
|
- [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
|
40
42
|
- [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
|
41
43
|
- [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
|
44
|
+
- [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin)
|
45
|
+
- [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
|
42
46
|
|
43
47
|
Load a model
|
44
48
|
|
data/lib/blingfire.rb
CHANGED
@@ -35,42 +35,34 @@ module BlingFire
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def text_to_words(text)
|
38
|
-
text
|
39
|
-
|
40
|
-
|
41
|
-
check_status out_size
|
42
|
-
encode_utf8(out[0, out_size - 1]).split(" ")
|
38
|
+
text_to(text, " ") do |t, out|
|
39
|
+
FFI.TextToWords(t, t.bytesize, out, out.size)
|
40
|
+
end
|
43
41
|
end
|
44
42
|
|
45
43
|
def text_to_words_with_model(model, text)
|
46
|
-
text
|
47
|
-
|
48
|
-
|
49
|
-
check_status out_size
|
50
|
-
encode_utf8(out[0, out_size - 1]).split(" ")
|
44
|
+
text_to(text, " ") do |t, out|
|
45
|
+
FFI.TextToWordsWithModel(t, t.bytesize, out, out.size, model)
|
46
|
+
end
|
51
47
|
end
|
52
48
|
|
53
49
|
def text_to_sentences(text)
|
54
|
-
text
|
55
|
-
|
56
|
-
|
57
|
-
check_status out_size
|
58
|
-
encode_utf8(out[0, out_size - 1]).split("\n")
|
50
|
+
text_to(text, "\n") do |t, out|
|
51
|
+
FFI.TextToSentences(t, t.bytesize, out, out.size)
|
52
|
+
end
|
59
53
|
end
|
60
54
|
|
61
55
|
def text_to_sentences_with_model(model, text)
|
62
|
-
text
|
63
|
-
|
64
|
-
|
65
|
-
check_status out_size
|
66
|
-
encode_utf8(out[0, out_size - 1]).split("\n")
|
56
|
+
text_to(text, "\n") do |t, out|
|
57
|
+
FFI.TextToSentencesWithModel(t, t.bytesize, out, out.size, model)
|
58
|
+
end
|
67
59
|
end
|
68
60
|
|
69
61
|
def text_to_ids(model, text, max_len = nil, unk_id = 0)
|
70
62
|
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
71
63
|
ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
|
72
64
|
out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
|
73
|
-
check_status out_size
|
65
|
+
check_status out_size, ids
|
74
66
|
ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
|
75
67
|
end
|
76
68
|
|
@@ -80,8 +72,17 @@ module BlingFire
|
|
80
72
|
|
81
73
|
private
|
82
74
|
|
83
|
-
def check_status(ret)
|
84
|
-
raise Error, "
|
75
|
+
def check_status(ret, ptr)
|
76
|
+
raise Error, "Not enough memory allocated" if ret == -1 || ret > ptr.size
|
77
|
+
end
|
78
|
+
|
79
|
+
def text_to(text, sep)
|
80
|
+
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
81
|
+
# TODO allocate less, and try again if needed
|
82
|
+
out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
|
83
|
+
out_size = yield(text, out)
|
84
|
+
check_status out_size, out
|
85
|
+
encode_utf8(out.to_str(out_size - 1)).split(sep)
|
85
86
|
end
|
86
87
|
|
87
88
|
def encode_utf8(text)
|
data/lib/blingfire/model.rb
CHANGED
data/lib/blingfire/version.rb
CHANGED
data/vendor/blingfiretokdll.dll
CHANGED
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blingfire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|