blingfire 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a93f90584141a618ff5bb73860068e2ea662c568a07e2f877a5f2be7e265919e
4
- data.tar.gz: 61b49a4bd7eb304f44c041730bce733102f1936c9340cd9c0c3f1390bd9e853a
3
+ metadata.gz: 225c7163bbbd6bc06e0b7d54b2a0a64c3b622c174e5c6cb0b8fa54b6e68740a3
4
+ data.tar.gz: b9d8ce2d21a450c53b1628bdab97b07aa2c39dd1b758fa763c39f052a944084c
5
5
  SHA512:
6
- metadata.gz: fd0242fc13696c620ffd3a7383b9088aaddade1f7280c4f35c54d5165f66e26b783e88fa2d65ced002e8f233af0637e43e9adb68ad7556a16cfdcbb6aff9ba7e
7
- data.tar.gz: a1206eaa056ed93639c00c03eb16be5fbbec6b077e1677a916b69c6244c26d3c2cce51299dc0394335c54d12de500a9648f30fa6f3785ef749be528904d5ae60
6
+ metadata.gz: 98eed6c71ee974381772e032b27d022cd9c9485a83e8dae25c21817a19489953c7c5dfd4bb4498d796b65a569bd3468f55df558603d904ed9293a4c21e7041d5
7
+ data.tar.gz: 15389737eec2ad109f528bf7cd064184205bc17d822d108031f29eb7cf9d421c0a84f40a3d170461e7fa3c853b1fbc912b28f07e403924f9744cb8f6c2294f2d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.1 (2020-05-01)
2
+
3
+ - Updated Bling Fire to 0.1.1
4
+ - Improved error message when model not found
5
+
1
6
  ## 0.1.0 (2020-02-24)
2
7
 
3
8
  - First release
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
- # BlingFire
1
+ # Bling Fire
2
2
 
3
- [BlingFire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
3
+ [Bling Fire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
4
+
5
+ [![Build Status](https://travis-ci.org/ankane/blingfire.svg?branch=master)](https://travis-ci.org/ankane/blingfire) [![Build status](https://ci.appveyor.com/api/projects/status/3gyca4gsjw2w9ns1/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/blingfire/branch/master)
4
6
 
5
7
  ## Installation
6
8
 
@@ -39,6 +41,8 @@ BlingFire comes with a default model that follows the tokenization logic of NLTK
39
41
  - [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
40
42
  - [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
41
43
  - [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
44
+ - [XLNet](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet.bin)
45
+ - [XLNet No Norm](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/xlnet_nonorm.bin)
42
46
 
43
47
  Load a model
44
48
 
data/lib/blingfire.rb CHANGED
@@ -35,42 +35,34 @@ module BlingFire
35
35
  end
36
36
 
37
37
  def text_to_words(text)
38
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
39
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
40
- out_size = FFI.TextToWords(text, text.bytesize, out, out.size)
41
- check_status out_size
42
- encode_utf8(out[0, out_size - 1]).split(" ")
38
+ text_to(text, " ") do |t, out|
39
+ FFI.TextToWords(t, t.bytesize, out, out.size)
40
+ end
43
41
  end
44
42
 
45
43
  def text_to_words_with_model(model, text)
46
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
47
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
48
- out_size = FFI.TextToWordsWithModel(text, text.bytesize, out, out.size, model)
49
- check_status out_size
50
- encode_utf8(out[0, out_size - 1]).split(" ")
44
+ text_to(text, " ") do |t, out|
45
+ FFI.TextToWordsWithModel(t, t.bytesize, out, out.size, model)
46
+ end
51
47
  end
52
48
 
53
49
  def text_to_sentences(text)
54
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
55
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
56
- out_size = FFI.TextToSentences(text, text.bytesize, out, out.size)
57
- check_status out_size
58
- encode_utf8(out[0, out_size - 1]).split("\n")
50
+ text_to(text, "\n") do |t, out|
51
+ FFI.TextToSentences(t, t.bytesize, out, out.size)
52
+ end
59
53
  end
60
54
 
61
55
  def text_to_sentences_with_model(model, text)
62
- text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
63
- out = Fiddle::Pointer.malloc(text.bytesize * 3)
64
- out_size = FFI.TextToSentencesWithModel(text, text.bytesize, out, out.size, model)
65
- check_status out_size
66
- encode_utf8(out[0, out_size - 1]).split("\n")
56
+ text_to(text, "\n") do |t, out|
57
+ FFI.TextToSentencesWithModel(t, t.bytesize, out, out.size, model)
58
+ end
67
59
  end
68
60
 
69
61
  def text_to_ids(model, text, max_len = nil, unk_id = 0)
70
62
  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
71
63
  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
72
64
  out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
73
- check_status out_size
65
+ check_status out_size, ids
74
66
  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
75
67
  end
76
68
 
@@ -80,8 +72,17 @@ module BlingFire
80
72
 
81
73
  private
82
74
 
83
- def check_status(ret)
84
- raise Error, "Bad status" if ret == -1
75
+ def check_status(ret, ptr)
76
+ raise Error, "Not enough memory allocated" if ret == -1 || ret > ptr.size
77
+ end
78
+
79
+ def text_to(text, sep)
80
+ text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
81
+ # TODO allocate less, and try again if needed
82
+ out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
83
+ out_size = yield(text, out)
84
+ check_status out_size, out
85
+ encode_utf8(out.to_str(out_size - 1)).split(sep)
85
86
  end
86
87
 
87
88
  def encode_utf8(text)
@@ -2,6 +2,7 @@ module BlingFire
2
2
  class Model
3
3
  def initialize(path = nil)
4
4
  if path
5
+ raise Error, "Model not found" unless File.exist?(path)
5
6
  @handle = FFI.LoadModel(path)
6
7
  ObjectSpace.define_finalizer(self, self.class.finalize(@handle))
7
8
  end
@@ -1,3 +1,3 @@
1
1
  module BlingFire
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
Binary file
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blingfire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-24 00:00:00.000000000 Z
11
+ date: 2020-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler