blingfire 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 130b126e317ef7c923db7f6ef2b53384d1983cbfc25894c27c6e5b59a4ca2523
4
- data.tar.gz: 2d121c5b405a760889a14977119d605e87375abcf2dc623151ce03c0b164ea2f
3
+ metadata.gz: af6d8ad14f2c3f56fb148eca079a81bb48b3cec80790bc184fc99afd03572f12
4
+ data.tar.gz: 92f351ac35b186b54b2d2c261427b3827ccb40613d3dd3fc1f57ce6eec83aa57
5
5
  SHA512:
6
- metadata.gz: 15ecb71b9875542281cfe9167bd410abb58ba24d9ff57181afe2d942101ca335b7aa95c0c4fcdede39090dd3ec8ab57f7f1ed83c341a4b2505cfc9711e86637e
7
- data.tar.gz: 23771de5c189aa3d7750cc2b09b2dd0bbe02512bf05fb8ccdad7f8664e63bbbcac1419c1d57f7137c16d9c57e8d36b5fac2d3cae96ea5979081d89bad6737ddd
6
+ metadata.gz: 5714bc6333a94669d1b6b69682dd3671933ffc0611107eeb2ce174263c329b42b57c92205afac8519b6626dcae631f724682fccfde57c6514c9cac9233653c62
7
+ data.tar.gz: 46a6de8f8a081c893db5d446b6e981a170efc4cdcfdaac158f58ba423592f0274990345066a703a687e67d2f2496cb46b2a46cfe8a0f476f19b30afd37fd4697
@@ -1,3 +1,10 @@
1
+ ## 0.1.3 (2020-10-01)
2
+
3
+ - Added `text_to_words_with_offsets` method
4
+ - Added `text_to_sentences_with_offsets` method
5
+ - Added `text_to_ids_with_offsets` method
6
+ - Added `normalize_spaces` method
7
+
1
8
  ## 0.1.2 (2020-06-25)
2
9
 
3
10
  - Updated Bling Fire to 0.1.3
@@ -1,22 +1,22 @@
1
- Copyright (c) 2020 Andrew Kane
2
-
3
1
  MIT License
4
2
 
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
3
+ Copyright (c) Microsoft Corporation. All rights reserved.
4
+ Copyright (c) 2020 Andrew Kane
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
12
 
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
15
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE
data/README.md CHANGED
@@ -32,6 +32,18 @@ Tokenize sentences
32
32
  model.text_to_sentences(text)
33
33
  ```
34
34
 
35
+ Get offsets for words
36
+
37
+ ```ruby
38
+ words, start_offsets, end_offsets = model.text_to_words_with_offsets(text)
39
+ ```
40
+
41
+ Get offsets for sentences
42
+
43
+ ```ruby
44
+ sentences, start_offsets, end_offsets = model.text_to_sentences_with_offsets(text)
45
+ ```
46
+
35
47
  ## Pre-trained Models
36
48
 
37
49
  BlingFire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
@@ -60,6 +72,12 @@ Convert text to ids
60
72
  model.text_to_ids(text)
61
73
  ```
62
74
 
75
+ Get offsets for ids
76
+
77
+ ```ruby
78
+ ids, start_offsets, end_offsets = model.text_to_ids_with_offsets(text)
79
+ ```
80
+
63
81
  ## History
64
82
 
65
83
  View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
@@ -79,6 +97,6 @@ To get started with development:
79
97
  git clone https://github.com/ankane/blingfire.git
80
98
  cd blingfire
81
99
  bundle install
82
- bundle exec rake vendor:all
100
+ bundle exec rake vendor:all download:models
83
101
  bundle exec rake test
84
102
  ```
@@ -46,6 +46,18 @@ module BlingFire
46
46
  end
47
47
  end
48
48
 
49
+ def text_to_words_with_offsets(text)
50
+ text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
51
+ FFI.TextToWordsWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
52
+ end
53
+ end
54
+
55
+ def text_to_words_with_offsets_with_model(model, text)
56
+ text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
57
+ FFI.TextToWordsWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
58
+ end
59
+ end
60
+
49
61
  def text_to_sentences(text)
50
62
  text_to(text, "\n") do |t, out|
51
63
  FFI.TextToSentences(t, t.bytesize, out, out.size)
@@ -58,6 +70,18 @@ module BlingFire
58
70
  end
59
71
  end
60
72
 
73
+ def text_to_sentences_with_offsets(text)
74
+ text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
75
+ FFI.TextToSentencesWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
76
+ end
77
+ end
78
+
79
+ def text_to_sentences_with_offsets_with_model(model, text)
80
+ text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
81
+ FFI.TextToSentencesWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
82
+ end
83
+ end
84
+
61
85
  def text_to_ids(model, text, max_len = nil, unk_id = 0)
62
86
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
63
87
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
@@ -66,10 +90,34 @@ module BlingFire
66
90
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
67
91
  end
68
92
 
93
+ def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
94
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
95
+ ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
96
+
97
+ start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
98
+ end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
99
+
100
+ out_size = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)
101
+
102
+ check_status out_size, ids
103
+
104
+ result = ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
105
+ [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
106
+ end
107
+
69
108
  def free_model(model)
70
109
  FFI.FreeModel(model)
71
110
  end
72
111
 
112
+ def normalize_spaces(text)
113
+ u_space = 0x20
114
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
115
+ out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
116
+ out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
117
+ check_status out_size, out
118
+ encode_utf8(out.to_str(out_size))
119
+ end
120
+
73
121
  private
74
122
 
75
123
  def check_status(ret, ptr)
@@ -79,14 +127,52 @@ module BlingFire
79
127
  def text_to(text, sep)
80
128
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
81
129
  # TODO allocate less, and try again if needed
82
- out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
130
+ out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
83
131
  out_size = yield(text, out)
84
132
  check_status out_size, out
85
133
  encode_utf8(out.to_str(out_size - 1)).split(sep)
86
134
  end
87
135
 
136
+ def text_to_with_offsets(text, sep)
137
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
138
+ # TODO allocate less, and try again if needed
139
+ out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
140
+
141
+ start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
142
+ end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
143
+
144
+ out_size = yield(text, out, start_offsets, end_offsets)
145
+
146
+ check_status out_size, out
147
+
148
+ result = encode_utf8(out.to_str(out_size - 1)).split(sep)
149
+ [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
150
+ end
151
+
88
152
  def encode_utf8(text)
89
153
  text.force_encoding(Encoding::UTF_8)
90
154
  end
155
+
156
+ def unpack_offsets(start_offsets, end_offsets, result, text)
157
+ start_bytes = start_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
158
+ end_bytes = end_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
159
+ starts = []
160
+ ends = []
161
+
162
+ # convert byte offsets to character offsets
163
+ # TODO see if more efficient to store next_pos in variable
164
+ pos = 0
165
+ text.each_char.with_index do |c, i|
166
+ while pos == start_bytes[starts.size]
167
+ starts << i
168
+ end
169
+ pos += c.bytesize
170
+ while pos - 1 == end_bytes[ends.size]
171
+ ends << i + 1
172
+ end
173
+ end
174
+
175
+ [starts, ends]
176
+ end
91
177
  end
92
178
  end
@@ -10,13 +10,35 @@ module BlingFire
10
10
  raise e
11
11
  end
12
12
 
13
+ # https://github.com/microsoft/BlingFire/blob/master/blingfiretools/blingfiretokdll/blingfiretokdll.cpp
14
+
15
+ # version
13
16
  extern "int GetBlingFireTokVersion()"
14
- extern "void* LoadModel(char * pszLdbFileName)"
15
- extern "int FreeModel(void* ModelPtr)"
17
+
18
+ # text to sentences
19
+ extern "int TextToSentencesWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
20
+ extern "int TextToSentencesWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
21
+ extern "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
22
+ extern "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
23
+
24
+ # text to words
25
+ extern "int TextToWordsWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
26
+ extern "int TextToWordsWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
16
27
  extern "int TextToWordsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
17
28
  extern "int TextToWords(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
29
+
30
+ # misc
31
+ extern "int NormalizeSpaces(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, int uSpace)"
32
+ extern "int TextToHashes(char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pHashArr, int MaxHashArrLength, int wordNgrams, int bucketSize)"
33
+
34
+ # model
35
+ extern "void* LoadModel(char * pszLdbFileName)"
36
+
37
+ # text to ids
38
+ extern "int TextToIdsWithOffsets(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int * pStartOffsets, int * pEndOffsets, int MaxIdsArrLength, int UnkId)"
18
39
  extern "int TextToIds(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int MaxIdsArrLength, int UnkId)"
19
- extern "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
20
- extern "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
40
+
41
+ # free model
42
+ extern "int FreeModel(void* ModelPtr)"
21
43
  end
22
44
  end
@@ -1,6 +1,7 @@
1
1
  module BlingFire
2
2
  class Model
3
3
  def initialize(path = nil)
4
+ @handle = nil
4
5
  if path
5
6
  raise Error, "Model not found" unless File.exist?(path)
6
7
  @handle = FFI.LoadModel(path)
@@ -16,6 +17,14 @@ module BlingFire
16
17
  end
17
18
  end
18
19
 
20
+ def text_to_words_with_offsets(text)
21
+ if @handle
22
+ BlingFire.text_to_words_with_offsets_with_model(@handle, text)
23
+ else
24
+ BlingFire.text_to_words_with_offsets(text)
25
+ end
26
+ end
27
+
19
28
  def text_to_sentences(text)
20
29
  if @handle
21
30
  BlingFire.text_to_sentences_with_model(@handle, text)
@@ -24,6 +33,14 @@ module BlingFire
24
33
  end
25
34
  end
26
35
 
36
+ def text_to_sentences_with_offsets(text)
37
+ if @handle
38
+ BlingFire.text_to_sentences_with_offsets_with_model(@handle, text)
39
+ else
40
+ BlingFire.text_to_sentences_with_offsets(text)
41
+ end
42
+ end
43
+
27
44
  def text_to_ids(text, max_len = nil, unk_id = 0)
28
45
  if @handle
29
46
  BlingFire.text_to_ids(@handle, text, max_len, unk_id)
@@ -32,6 +49,14 @@ module BlingFire
32
49
  end
33
50
  end
34
51
 
52
+ def text_to_ids_with_offsets(text, max_len = nil, unk_id = 0)
53
+ if @handle
54
+ BlingFire.text_to_ids_with_offsets(@handle, text, max_len, unk_id)
55
+ else
56
+ raise "Not implemented"
57
+ end
58
+ end
59
+
35
60
  def to_ptr
36
61
  @handle
37
62
  end
@@ -1,3 +1,3 @@
1
1
  module BlingFire
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blingfire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-25 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler