blingfire 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/LICENSE.txt +18 -18
- data/README.md +19 -1
- data/lib/blingfire.rb +87 -1
- data/lib/blingfire/ffi.rb +26 -4
- data/lib/blingfire/model.rb +25 -0
- data/lib/blingfire/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: af6d8ad14f2c3f56fb148eca079a81bb48b3cec80790bc184fc99afd03572f12
|
4
|
+
data.tar.gz: 92f351ac35b186b54b2d2c261427b3827ccb40613d3dd3fc1f57ce6eec83aa57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5714bc6333a94669d1b6b69682dd3671933ffc0611107eeb2ce174263c329b42b57c92205afac8519b6626dcae631f724682fccfde57c6514c9cac9233653c62
|
7
|
+
data.tar.gz: 46a6de8f8a081c893db5d446b6e981a170efc4cdcfdaac158f58ba423592f0274990345066a703a687e67d2f2496cb46b2a46cfe8a0f476f19b30afd37fd4697
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2020 Andrew Kane
|
2
|
-
|
3
1
|
MIT License
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
Copyright (c) Microsoft Corporation. All rights reserved.
|
4
|
+
Copyright (c) 2020 Andrew Kane
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
12
|
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
15
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
OF
|
22
|
-
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE
|
data/README.md
CHANGED
@@ -32,6 +32,18 @@ Tokenize sentences
|
|
32
32
|
model.text_to_sentences(text)
|
33
33
|
```
|
34
34
|
|
35
|
+
Get offsets for words
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
words, start_offsets, end_offsets = model.text_to_words_with_offsets(text)
|
39
|
+
```
|
40
|
+
|
41
|
+
Get offsets for sentences
|
42
|
+
|
43
|
+
```ruby
|
44
|
+
sentences, start_offsets, end_offsets = model.text_to_sentences_with_offsets(text)
|
45
|
+
```
|
46
|
+
|
35
47
|
## Pre-trained Models
|
36
48
|
|
37
49
|
BlingFire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
|
@@ -60,6 +72,12 @@ Convert text to ids
|
|
60
72
|
model.text_to_ids(text)
|
61
73
|
```
|
62
74
|
|
75
|
+
Get offsets for ids
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
ids, start_offsets, end_offsets = model.text_to_ids_with_offsets(text)
|
79
|
+
```
|
80
|
+
|
63
81
|
## History
|
64
82
|
|
65
83
|
View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
|
@@ -79,6 +97,6 @@ To get started with development:
|
|
79
97
|
git clone https://github.com/ankane/blingfire.git
|
80
98
|
cd blingfire
|
81
99
|
bundle install
|
82
|
-
bundle exec rake vendor:all
|
100
|
+
bundle exec rake vendor:all download:models
|
83
101
|
bundle exec rake test
|
84
102
|
```
|
data/lib/blingfire.rb
CHANGED
@@ -46,6 +46,18 @@ module BlingFire
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
+
def text_to_words_with_offsets(text)
|
50
|
+
text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
|
51
|
+
FFI.TextToWordsWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def text_to_words_with_offsets_with_model(model, text)
|
56
|
+
text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
|
57
|
+
FFI.TextToWordsWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
49
61
|
def text_to_sentences(text)
|
50
62
|
text_to(text, "\n") do |t, out|
|
51
63
|
FFI.TextToSentences(t, t.bytesize, out, out.size)
|
@@ -58,6 +70,18 @@ module BlingFire
|
|
58
70
|
end
|
59
71
|
end
|
60
72
|
|
73
|
+
def text_to_sentences_with_offsets(text)
|
74
|
+
text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
|
75
|
+
FFI.TextToSentencesWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def text_to_sentences_with_offsets_with_model(model, text)
|
80
|
+
text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
|
81
|
+
FFI.TextToSentencesWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
61
85
|
def text_to_ids(model, text, max_len = nil, unk_id = 0)
|
62
86
|
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
63
87
|
ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
|
@@ -66,10 +90,34 @@ module BlingFire
|
|
66
90
|
ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
|
67
91
|
end
|
68
92
|
|
93
|
+
def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
|
94
|
+
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
95
|
+
ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
|
96
|
+
|
97
|
+
start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
|
98
|
+
end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
|
99
|
+
|
100
|
+
out_size = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)
|
101
|
+
|
102
|
+
check_status out_size, ids
|
103
|
+
|
104
|
+
result = ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
|
105
|
+
[result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
|
106
|
+
end
|
107
|
+
|
69
108
|
def free_model(model)
|
70
109
|
FFI.FreeModel(model)
|
71
110
|
end
|
72
111
|
|
112
|
+
def normalize_spaces(text)
|
113
|
+
u_space = 0x20
|
114
|
+
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
115
|
+
out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
|
116
|
+
out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
|
117
|
+
check_status out_size, out
|
118
|
+
encode_utf8(out.to_str(out_size))
|
119
|
+
end
|
120
|
+
|
73
121
|
private
|
74
122
|
|
75
123
|
def check_status(ret, ptr)
|
@@ -79,14 +127,52 @@ module BlingFire
|
|
79
127
|
def text_to(text, sep)
|
80
128
|
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
81
129
|
# TODO allocate less, and try again if needed
|
82
|
-
out = Fiddle::Pointer.malloc([text.bytesize *
|
130
|
+
out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
|
83
131
|
out_size = yield(text, out)
|
84
132
|
check_status out_size, out
|
85
133
|
encode_utf8(out.to_str(out_size - 1)).split(sep)
|
86
134
|
end
|
87
135
|
|
136
|
+
def text_to_with_offsets(text, sep)
|
137
|
+
text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
|
138
|
+
# TODO allocate less, and try again if needed
|
139
|
+
out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
|
140
|
+
|
141
|
+
start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
|
142
|
+
end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
|
143
|
+
|
144
|
+
out_size = yield(text, out, start_offsets, end_offsets)
|
145
|
+
|
146
|
+
check_status out_size, out
|
147
|
+
|
148
|
+
result = encode_utf8(out.to_str(out_size - 1)).split(sep)
|
149
|
+
[result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
|
150
|
+
end
|
151
|
+
|
88
152
|
def encode_utf8(text)
|
89
153
|
text.force_encoding(Encoding::UTF_8)
|
90
154
|
end
|
155
|
+
|
156
|
+
def unpack_offsets(start_offsets, end_offsets, result, text)
|
157
|
+
start_bytes = start_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
|
158
|
+
end_bytes = end_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
|
159
|
+
starts = []
|
160
|
+
ends = []
|
161
|
+
|
162
|
+
# convert byte offsets to character offsets
|
163
|
+
# TODO see if more efficient to store next_pos in variable
|
164
|
+
pos = 0
|
165
|
+
text.each_char.with_index do |c, i|
|
166
|
+
while pos == start_bytes[starts.size]
|
167
|
+
starts << i
|
168
|
+
end
|
169
|
+
pos += c.bytesize
|
170
|
+
while pos - 1 == end_bytes[ends.size]
|
171
|
+
ends << i + 1
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
[starts, ends]
|
176
|
+
end
|
91
177
|
end
|
92
178
|
end
|
data/lib/blingfire/ffi.rb
CHANGED
@@ -10,13 +10,35 @@ module BlingFire
|
|
10
10
|
raise e
|
11
11
|
end
|
12
12
|
|
13
|
+
# https://github.com/microsoft/BlingFire/blob/master/blingfiretools/blingfiretokdll/blingfiretokdll.cpp
|
14
|
+
|
15
|
+
# version
|
13
16
|
extern "int GetBlingFireTokVersion()"
|
14
|
-
|
15
|
-
|
17
|
+
|
18
|
+
# text to sentences
|
19
|
+
extern "int TextToSentencesWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
|
20
|
+
extern "int TextToSentencesWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
|
21
|
+
extern "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
|
22
|
+
extern "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
|
23
|
+
|
24
|
+
# text to words
|
25
|
+
extern "int TextToWordsWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
|
26
|
+
extern "int TextToWordsWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
|
16
27
|
extern "int TextToWordsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
|
17
28
|
extern "int TextToWords(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
|
29
|
+
|
30
|
+
# misc
|
31
|
+
extern "int NormalizeSpaces(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, int uSpace)"
|
32
|
+
extern "int TextToHashes(char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pHashArr, int MaxHashArrLength, int wordNgrams, int bucketSize)"
|
33
|
+
|
34
|
+
# model
|
35
|
+
extern "void* LoadModel(char * pszLdbFileName)"
|
36
|
+
|
37
|
+
# text to ids
|
38
|
+
extern "int TextToIdsWithOffsets(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int * pStartOffsets, int * pEndOffsets, int MaxIdsArrLength, int UnkId)"
|
18
39
|
extern "int TextToIds(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int MaxIdsArrLength, int UnkId)"
|
19
|
-
|
20
|
-
|
40
|
+
|
41
|
+
# free model
|
42
|
+
extern "int FreeModel(void* ModelPtr)"
|
21
43
|
end
|
22
44
|
end
|
data/lib/blingfire/model.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module BlingFire
|
2
2
|
class Model
|
3
3
|
def initialize(path = nil)
|
4
|
+
@handle = nil
|
4
5
|
if path
|
5
6
|
raise Error, "Model not found" unless File.exist?(path)
|
6
7
|
@handle = FFI.LoadModel(path)
|
@@ -16,6 +17,14 @@ module BlingFire
|
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
20
|
+
def text_to_words_with_offsets(text)
|
21
|
+
if @handle
|
22
|
+
BlingFire.text_to_words_with_offsets_with_model(@handle, text)
|
23
|
+
else
|
24
|
+
BlingFire.text_to_words_with_offsets(text)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
19
28
|
def text_to_sentences(text)
|
20
29
|
if @handle
|
21
30
|
BlingFire.text_to_sentences_with_model(@handle, text)
|
@@ -24,6 +33,14 @@ module BlingFire
|
|
24
33
|
end
|
25
34
|
end
|
26
35
|
|
36
|
+
def text_to_sentences_with_offsets(text)
|
37
|
+
if @handle
|
38
|
+
BlingFire.text_to_sentences_with_offsets_with_model(@handle, text)
|
39
|
+
else
|
40
|
+
BlingFire.text_to_sentences_with_offsets(text)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
27
44
|
def text_to_ids(text, max_len = nil, unk_id = 0)
|
28
45
|
if @handle
|
29
46
|
BlingFire.text_to_ids(@handle, text, max_len, unk_id)
|
@@ -32,6 +49,14 @@ module BlingFire
|
|
32
49
|
end
|
33
50
|
end
|
34
51
|
|
52
|
+
def text_to_ids_with_offsets(text, max_len = nil, unk_id = 0)
|
53
|
+
if @handle
|
54
|
+
BlingFire.text_to_ids_with_offsets(@handle, text, max_len, unk_id)
|
55
|
+
else
|
56
|
+
raise "Not implemented"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
35
60
|
def to_ptr
|
36
61
|
@handle
|
37
62
|
end
|
data/lib/blingfire/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blingfire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|