blingfire 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +6 -2
- data/lib/blingfire.rb +24 -23
- data/lib/blingfire/model.rb +1 -0
- data/lib/blingfire/version.rb +1 -1
- data/vendor/blingfiretokdll.dll +0 -0
- data/vendor/libblingfiretokdll.dylib +0 -0
- data/vendor/libblingfiretokdll.so +0 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 225c7163bbbd6bc06e0b7d54b2a0a64c3b622c174e5c6cb0b8fa54b6e68740a3
|
4
|
+
data.tar.gz: b9d8ce2d21a450c53b1628bdab97b07aa2c39dd1b758fa763c39f052a944084c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98eed6c71ee974381772e032b27d022cd9c9485a83e8dae25c21817a19489953c7c5dfd4bb4498d796b65a569bd3468f55df558603d904ed9293a4c21e7041d5
|
7
|
+
data.tar.gz: 15389737eec2ad109f528bf7cd064184205bc17d822d108031f29eb7cf9d421c0a84f40a3d170461e7fa3c853b1fbc912b28f07e403924f9744cb8f6c2294f2d
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
#
|
1
|
+
# Bling Fire
|
2
2
|
|
3
|
-
[
|
3
|
+
[Bling Fire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
|
4
|
+
|
5
|
+
[![Build Status](https://travis-ci.org/ankane/blingfire.svg?branch=master)](https://travis-ci.org/ankane/blingfire) [![Build status](https://ci.appveyor.com/api/projects/status/3gyca4gsjw2w9ns1/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/blingfire/branch/master)
|
4
6
|
|
5
7
|
## Installation
|
6
8
|
|
@@ -39,6 +41,8 @@ BlingFire comes with a default model that follows the tokenization logic of NLTK
|
|
39
41
|
- [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
|
40
42
|
- [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
|
41
43
|
- [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
|
44
|
+
- [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin)
|
45
|
+
- [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
|
42
46
|
|
43
47
|
Load a model
|
44
48
|
|
data/lib/blingfire.rb
CHANGED
@@ -35,42 +35,34 @@ module BlingFire
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def text_to_words(text)
|
38
|
-
text
|
39
|
-
|
40
|
-
|
41
|
-
check_status out_size
|
42
|
-
encode_utf8(out[0, out_size - 1]).split(" ")
|
38
|
+
text_to(text, " ") do |t, out|
|
39
|
+
FFI.TextToWords(t, t.bytesize, out, out.size)
|
40
|
+
end
|
43
41
|
end
|
44
42
|
|
45
43
|
def text_to_words_with_model(model, text)
|
46
|
-
text
|
47
|
-
|
48
|
-
|
49
|
-
check_status out_size
|
50
|
-
encode_utf8(out[0, out_size - 1]).split(" ")
|
44
|
+
text_to(text, " ") do |t, out|
|
45
|
+
FFI.TextToWordsWithModel(t, t.bytesize, out, out.size, model)
|
46
|
+
end
|
51
47
|
end
|
52
48
|
|
53
49
|
def text_to_sentences(text)
|
54
|
-
text
|
55
|
-
|
56
|
-
|
57
|
-
check_status out_size
|
58
|
-
encode_utf8(out[0, out_size - 1]).split("\n")
|
50
|
+
text_to(text, "\n") do |t, out|
|
51
|
+
FFI.TextToSentences(t, t.bytesize, out, out.size)
|
52
|
+
end
|
59
53
|
end
|
60
54
|
|
61
55
|
def text_to_sentences_with_model(model, text)
|
62
|
-
text
|
63
|
-
|
64
|
-
|
65
|
-
check_status out_size
|
66
|
-
encode_utf8(out[0, out_size - 1]).split("\n")
|
56
|
+
text_to(text, "\n") do |t, out|
|
57
|
+
FFI.TextToSentencesWithModel(t, t.bytesize, out, out.size, model)
|
58
|
+
end
|
67
59
|
end
|
68
60
|
|
69
61
|
def text_to_ids(model, text, max_len = nil, unk_id = 0)
|
70
62
|
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
71
63
|
ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
|
72
64
|
out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
|
73
|
-
check_status out_size
|
65
|
+
check_status out_size, ids
|
74
66
|
ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
|
75
67
|
end
|
76
68
|
|
@@ -80,8 +72,17 @@ module BlingFire
|
|
80
72
|
|
81
73
|
private
|
82
74
|
|
83
|
-
def check_status(ret)
|
84
|
-
raise Error, "
|
75
|
+
def check_status(ret, ptr)
|
76
|
+
raise Error, "Not enough memory allocated" if ret == -1 || ret > ptr.size
|
77
|
+
end
|
78
|
+
|
79
|
+
def text_to(text, sep)
|
80
|
+
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
81
|
+
# TODO allocate less, and try again if needed
|
82
|
+
out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
|
83
|
+
out_size = yield(text, out)
|
84
|
+
check_status out_size, out
|
85
|
+
encode_utf8(out.to_str(out_size - 1)).split(sep)
|
85
86
|
end
|
86
87
|
|
87
88
|
def encode_utf8(text)
|
data/lib/blingfire/model.rb
CHANGED
data/lib/blingfire/version.rb
CHANGED
data/vendor/blingfiretokdll.dll
CHANGED
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blingfire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|