blingfire 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 130b126e317ef7c923db7f6ef2b53384d1983cbfc25894c27c6e5b59a4ca2523
4
- data.tar.gz: 2d121c5b405a760889a14977119d605e87375abcf2dc623151ce03c0b164ea2f
3
+ metadata.gz: af6d8ad14f2c3f56fb148eca079a81bb48b3cec80790bc184fc99afd03572f12
4
+ data.tar.gz: 92f351ac35b186b54b2d2c261427b3827ccb40613d3dd3fc1f57ce6eec83aa57
5
5
  SHA512:
6
- metadata.gz: 15ecb71b9875542281cfe9167bd410abb58ba24d9ff57181afe2d942101ca335b7aa95c0c4fcdede39090dd3ec8ab57f7f1ed83c341a4b2505cfc9711e86637e
7
- data.tar.gz: 23771de5c189aa3d7750cc2b09b2dd0bbe02512bf05fb8ccdad7f8664e63bbbcac1419c1d57f7137c16d9c57e8d36b5fac2d3cae96ea5979081d89bad6737ddd
6
+ metadata.gz: 5714bc6333a94669d1b6b69682dd3671933ffc0611107eeb2ce174263c329b42b57c92205afac8519b6626dcae631f724682fccfde57c6514c9cac9233653c62
7
+ data.tar.gz: 46a6de8f8a081c893db5d446b6e981a170efc4cdcfdaac158f58ba423592f0274990345066a703a687e67d2f2496cb46b2a46cfe8a0f476f19b30afd37fd4697
@@ -1,3 +1,10 @@
1
+ ## 0.1.3 (2020-10-01)
2
+
3
+ - Added `text_to_words_with_offsets` method
4
+ - Added `text_to_sentences_with_offsets` method
5
+ - Added `text_to_ids_with_offsets` method
6
+ - Added `normalize_spaces` method
7
+
1
8
  ## 0.1.2 (2020-06-25)
2
9
 
3
10
  - Updated Bling Fire to 0.1.3
@@ -1,22 +1,22 @@
1
- Copyright (c) 2020 Andrew Kane
2
-
3
1
  MIT License
4
2
 
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
3
+ Copyright (c) Microsoft Corporation. All rights reserved.
4
+ Copyright (c) 2020 Andrew Kane
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
12
 
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
15
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
data/README.md CHANGED
@@ -32,6 +32,18 @@ Tokenize sentences
32
32
  model.text_to_sentences(text)
33
33
  ```
34
34
 
35
+ Get offsets for words
36
+
37
+ ```ruby
38
+ words, start_offsets, end_offsets = model.text_to_words_with_offsets(text)
39
+ ```
40
+
41
+ Get offsets for sentences
42
+
43
+ ```ruby
44
+ sentences, start_offsets, end_offsets = model.text_to_sentences_with_offsets(text)
45
+ ```
46
+
35
47
  ## Pre-trained Models
36
48
 
37
49
  BlingFire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
@@ -60,6 +72,12 @@ Convert text to ids
60
72
  model.text_to_ids(text)
61
73
  ```
62
74
 
75
+ Get offsets for ids
76
+
77
+ ```ruby
78
+ ids, start_offsets, end_offsets = model.text_to_ids_with_offsets(text)
79
+ ```
80
+
63
81
  ## History
64
82
 
65
83
  View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
@@ -79,6 +97,6 @@ To get started with development:
79
97
  git clone https://github.com/ankane/blingfire.git
80
98
  cd blingfire
81
99
  bundle install
82
- bundle exec rake vendor:all
100
+ bundle exec rake vendor:all download:models
83
101
  bundle exec rake test
84
102
  ```
@@ -46,6 +46,18 @@ module BlingFire
46
46
  end
47
47
  end
48
48
 
49
+ def text_to_words_with_offsets(text)
50
+ text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
51
+ FFI.TextToWordsWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
52
+ end
53
+ end
54
+
55
+ def text_to_words_with_offsets_with_model(model, text)
56
+ text_to_with_offsets(text, " ") do |t, out, start_offsets, end_offsets|
57
+ FFI.TextToWordsWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
58
+ end
59
+ end
60
+
49
61
  def text_to_sentences(text)
50
62
  text_to(text, "\n") do |t, out|
51
63
  FFI.TextToSentences(t, t.bytesize, out, out.size)
@@ -58,6 +70,18 @@ module BlingFire
58
70
  end
59
71
  end
60
72
 
73
+ def text_to_sentences_with_offsets(text)
74
+ text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
75
+ FFI.TextToSentencesWithOffsets(t, t.bytesize, out, start_offsets, end_offsets, out.size)
76
+ end
77
+ end
78
+
79
+ def text_to_sentences_with_offsets_with_model(model, text)
80
+ text_to_with_offsets(text, "\n") do |t, out, start_offsets, end_offsets|
81
+ FFI.TextToSentencesWithOffsetsWithModel(t, t.bytesize, out, start_offsets, end_offsets, out.size, model)
82
+ end
83
+ end
84
+
61
85
  def text_to_ids(model, text, max_len = nil, unk_id = 0)
62
86
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
63
87
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
@@ -66,10 +90,34 @@ module BlingFire
66
90
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
67
91
  end
68
92
 
93
+ def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
94
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
95
+ ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
96
+
97
+ start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
98
+ end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
99
+
100
+ out_size = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)
101
+
102
+ check_status out_size, ids
103
+
104
+ result = ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
105
+ [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
106
+ end
107
+
69
108
  def free_model(model)
70
109
  FFI.FreeModel(model)
71
110
  end
72
111
 
112
+ def normalize_spaces(text)
113
+ u_space = 0x20
114
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
115
+ out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
116
+ out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
117
+ check_status out_size, out
118
+ encode_utf8(out.to_str(out_size))
119
+ end
120
+
73
121
  private
74
122
 
75
123
  def check_status(ret, ptr)
@@ -79,14 +127,52 @@ module BlingFire
79
127
  def text_to(text, sep)
80
128
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
81
129
  # TODO allocate less, and try again if needed
82
- out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
130
+ out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
83
131
  out_size = yield(text, out)
84
132
  check_status out_size, out
85
133
  encode_utf8(out.to_str(out_size - 1)).split(sep)
86
134
  end
87
135
 
136
+ def text_to_with_offsets(text, sep)
137
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
138
+ # TODO allocate less, and try again if needed
139
+ out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
140
+
141
+ start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
142
+ end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
143
+
144
+ out_size = yield(text, out, start_offsets, end_offsets)
145
+
146
+ check_status out_size, out
147
+
148
+ result = encode_utf8(out.to_str(out_size - 1)).split(sep)
149
+ [result].concat(unpack_offsets(start_offsets, end_offsets, result, text))
150
+ end
151
+
88
152
  def encode_utf8(text)
89
153
  text.force_encoding(Encoding::UTF_8)
90
154
  end
155
+
156
+ def unpack_offsets(start_offsets, end_offsets, result, text)
157
+ start_bytes = start_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
158
+ end_bytes = end_offsets.to_s(Fiddle::SIZEOF_INT * result.size).unpack("i*")
159
+ starts = []
160
+ ends = []
161
+
162
+ # convert byte offsets to character offsets
163
+ # TODO see if more efficient to store next_pos in variable
164
+ pos = 0
165
+ text.each_char.with_index do |c, i|
166
+ while pos == start_bytes[starts.size]
167
+ starts << i
168
+ end
169
+ pos += c.bytesize
170
+ while pos - 1 == end_bytes[ends.size]
171
+ ends << i + 1
172
+ end
173
+ end
174
+
175
+ [starts, ends]
176
+ end
91
177
  end
92
178
  end
@@ -10,13 +10,35 @@ module BlingFire
10
10
  raise e
11
11
  end
12
12
 
13
+ # https://github.com/microsoft/BlingFire/blob/master/blingfiretools/blingfiretokdll/blingfiretokdll.cpp
14
+
15
+ # version
13
16
  extern "int GetBlingFireTokVersion()"
14
- extern "void* LoadModel(char * pszLdbFileName)"
15
- extern "int FreeModel(void* ModelPtr)"
17
+
18
+ # text to sentences
19
+ extern "int TextToSentencesWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
20
+ extern "int TextToSentencesWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
21
+ extern "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
22
+ extern "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
23
+
24
+ # text to words
25
+ extern "int TextToWordsWithOffsetsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount, void * hModel)"
26
+ extern "int TextToWordsWithOffsets(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, int MaxOutUtf8StrByteCount)"
16
27
  extern "int TextToWordsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
17
28
  extern "int TextToWords(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
29
+
30
+ # misc
31
+ extern "int NormalizeSpaces(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, int uSpace)"
32
+ extern "int TextToHashes(char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pHashArr, int MaxHashArrLength, int wordNgrams, int bucketSize)"
33
+
34
+ # model
35
+ extern "void* LoadModel(char * pszLdbFileName)"
36
+
37
+ # text to ids
38
+ extern "int TextToIdsWithOffsets(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int * pStartOffsets, int * pEndOffsets, int MaxIdsArrLength, int UnkId)"
18
39
  extern "int TextToIds(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int MaxIdsArrLength, int UnkId)"
19
- extern "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)"
20
- extern "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
40
+
41
+ # free model
42
+ extern "int FreeModel(void* ModelPtr)"
21
43
  end
22
44
  end
@@ -1,6 +1,7 @@
1
1
  module BlingFire
2
2
  class Model
3
3
  def initialize(path = nil)
4
+ @handle = nil
4
5
  if path
5
6
  raise Error, "Model not found" unless File.exist?(path)
6
7
  @handle = FFI.LoadModel(path)
@@ -16,6 +17,14 @@ module BlingFire
16
17
  end
17
18
  end
18
19
 
20
+ def text_to_words_with_offsets(text)
21
+ if @handle
22
+ BlingFire.text_to_words_with_offsets_with_model(@handle, text)
23
+ else
24
+ BlingFire.text_to_words_with_offsets(text)
25
+ end
26
+ end
27
+
19
28
  def text_to_sentences(text)
20
29
  if @handle
21
30
  BlingFire.text_to_sentences_with_model(@handle, text)
@@ -24,6 +33,14 @@ module BlingFire
24
33
  end
25
34
  end
26
35
 
36
+ def text_to_sentences_with_offsets(text)
37
+ if @handle
38
+ BlingFire.text_to_sentences_with_offsets_with_model(@handle, text)
39
+ else
40
+ BlingFire.text_to_sentences_with_offsets(text)
41
+ end
42
+ end
43
+
27
44
  def text_to_ids(text, max_len = nil, unk_id = 0)
28
45
  if @handle
29
46
  BlingFire.text_to_ids(@handle, text, max_len, unk_id)
@@ -32,6 +49,14 @@ module BlingFire
32
49
  end
33
50
  end
34
51
 
52
+ def text_to_ids_with_offsets(text, max_len = nil, unk_id = 0)
53
+ if @handle
54
+ BlingFire.text_to_ids_with_offsets(@handle, text, max_len, unk_id)
55
+ else
56
+ raise "Not implemented"
57
+ end
58
+ end
59
+
35
60
  def to_ptr
36
61
  @handle
37
62
  end
@@ -1,3 +1,3 @@
1
1
  module BlingFire
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blingfire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-25 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler