blingfire 0.1.0 → 0.1.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a93f90584141a618ff5bb73860068e2ea662c568a07e2f877a5f2be7e265919e
4
- data.tar.gz: 61b49a4bd7eb304f44c041730bce733102f1936c9340cd9c0c3f1390bd9e853a
3
+ metadata.gz: 225c7163bbbd6bc06e0b7d54b2a0a64c3b622c174e5c6cb0b8fa54b6e68740a3
4
+ data.tar.gz: b9d8ce2d21a450c53b1628bdab97b07aa2c39dd1b758fa763c39f052a944084c
5
5
  SHA512:
6
- metadata.gz: fd0242fc13696c620ffd3a7383b9088aaddade1f7280c4f35c54d5165f66e26b783e88fa2d65ced002e8f233af0637e43e9adb68ad7556a16cfdcbb6aff9ba7e
7
- data.tar.gz: a1206eaa056ed93639c00c03eb16be5fbbec6b077e1677a916b69c6244c26d3c2cce51299dc0394335c54d12de500a9648f30fa6f3785ef749be528904d5ae60
6
+ metadata.gz: 98eed6c71ee974381772e032b27d022cd9c9485a83e8dae25c21817a19489953c7c5dfd4bb4498d796b65a569bd3468f55df558603d904ed9293a4c21e7041d5
7
+ data.tar.gz: 15389737eec2ad109f528bf7cd064184205bc17d822d108031f29eb7cf9d421c0a84f40a3d170461e7fa3c853b1fbc912b28f07e403924f9744cb8f6c2294f2d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.1 (2020-05-01)
2
+
3
+ - Updated Bling Fire to 0.1.1
4
+ - Improved error message when model not found
5
+
1
6
  ## 0.1.0 (2020-02-24)
2
7
 
3
8
  - First release
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
- # BlingFire
1
+ # Bling Fire
2
2
 
3
- [BlingFire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
3
+ [Bling Fire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
4
+
5
+ [![Build Status](https://travis-ci.org/ankane/blingfire.svg?branch=master)](https://travis-ci.org/ankane/blingfire) [![Build status](https://ci.appveyor.com/api/projects/status/3gyca4gsjw2w9ns1/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/blingfire/branch/master)
4
6
 
5
7
  ## Installation
6
8
 
@@ -39,6 +41,8 @@ BlingFire comes with a default model that follows the tokenization logic of NLTK
39
41
  - [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
40
42
  - [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
41
43
  - [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
44
+ - [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin)
45
+ - [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
42
46
 
43
47
  Load a model
44
48
 
data/lib/blingfire.rb CHANGED
@@ -35,42 +35,34 @@ module BlingFire
35
35
  end
36
36
 
37
37
  def text_to_words(text)
38
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
39
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
40
- out_size = FFI.TextToWords(text, text.bytesize, out, out.size)
41
- check_status out_size
42
- encode_utf8(out[0, out_size - 1]).split(" ")
38
+ text_to(text, " ") do |t, out|
39
+ FFI.TextToWords(t, t.bytesize, out, out.size)
40
+ end
43
41
  end
44
42
 
45
43
  def text_to_words_with_model(model, text)
46
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
47
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
48
- out_size = FFI.TextToWordsWithModel(text, text.bytesize, out, out.size, model)
49
- check_status out_size
50
- encode_utf8(out[0, out_size - 1]).split(" ")
44
+ text_to(text, " ") do |t, out|
45
+ FFI.TextToWordsWithModel(t, t.bytesize, out, out.size, model)
46
+ end
51
47
  end
52
48
 
53
49
  def text_to_sentences(text)
54
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
55
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
56
- out_size = FFI.TextToSentences(text, text.bytesize, out, out.size)
57
- check_status out_size
58
- encode_utf8(out[0, out_size - 1]).split("\n")
50
+ text_to(text, "\n") do |t, out|
51
+ FFI.TextToSentences(t, t.bytesize, out, out.size)
52
+ end
59
53
  end
60
54
 
61
55
  def text_to_sentences_with_model(model, text)
62
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
63
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
64
- out_size = FFI.TextToSentencesWithModel(text, text.bytesize, out, out.size, model)
65
- check_status out_size
66
- encode_utf8(out[0, out_size - 1]).split("\n")
56
+ text_to(text, "\n") do |t, out|
57
+ FFI.TextToSentencesWithModel(t, t.bytesize, out, out.size, model)
58
+ end
67
59
  end
68
60
 
69
61
  def text_to_ids(model, text, max_len = nil, unk_id = 0)
70
62
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
71
63
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
72
64
  out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
73
- check_status out_size
65
+ check_status out_size, ids
74
66
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
75
67
  end
76
68
 
@@ -80,8 +72,17 @@ module BlingFire
80
72
 
81
73
  private
82
74
 
83
- def check_status(ret)
84
- raise Error, "Bad status" if ret == -1
75
+ def check_status(ret, ptr)
76
+ raise Error, "Not enough memory allocated" if ret == -1 || ret > ptr.size
77
+ end
78
+
79
+ def text_to(text, sep)
80
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
81
+ # TODO allocate less, and try again if needed
82
+ out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
83
+ out_size = yield(text, out)
84
+ check_status out_size, out
85
+ encode_utf8(out.to_str(out_size - 1)).split(sep)
85
86
  end
86
87
 
87
88
  def encode_utf8(text)
@@ -2,6 +2,7 @@ module BlingFire
2
2
  class Model
3
3
  def initialize(path = nil)
4
4
  if path
5
+ raise Error, "Model not found" unless File.exist?(path)
5
6
  @handle = FFI.LoadModel(path)
6
7
  ObjectSpace.define_finalizer(self, self.class.finalize(@handle))
7
8
  end
@@ -1,3 +1,3 @@
1
1
  module BlingFire
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blingfire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-24 00:00:00.000000000 Z
11
+ date: 2020-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler