blingfire 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: a93f90584141a618ff5bb73860068e2ea662c568a07e2f877a5f2be7e265919e
4
+ data.tar.gz: 61b49a4bd7eb304f44c041730bce733102f1936c9340cd9c0c3f1390bd9e853a
5
+ SHA512:
6
+ metadata.gz: fd0242fc13696c620ffd3a7383b9088aaddade1f7280c4f35c54d5165f66e26b783e88fa2d65ced002e8f233af0637e43e9adb68ad7556a16cfdcbb6aff9ba7e
7
+ data.tar.gz: a1206eaa056ed93639c00c03eb16be5fbbec6b077e1677a916b69c6244c26d3c2cce51299dc0394335c54d12de500a9648f30fa6f3785ef749be528904d5ae60
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-02-24)
2
+
3
+ - First release
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2020 Andrew Kane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,76 @@
1
+ # BlingFire
2
+
3
+ [BlingFire](https://github.com/microsoft/BlingFire) - high-speed text tokenization - for Ruby
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application’s Gemfile:
8
+
9
+ ```ruby
10
+ gem 'blingfire'
11
+ ```
12
+
13
+ ## Getting Started
14
+
15
+ Create a model
16
+
17
+ ```ruby
18
+ model = BlingFire::Model.new
19
+ ```
20
+
21
+ Tokenize words
22
+
23
+ ```ruby
24
+ model.text_to_words(text)
25
+ ```
26
+
27
+ Tokenize sentences
28
+
29
+ ```ruby
30
+ model.text_to_sentences(text)
31
+ ```
32
+
33
+ ## Pre-trained Models
34
+
35
+ BlingFire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
36
+
37
+ - [BERT Base](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_tok.bin)
38
+ - [BERT Base Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_cased_tok.bin)
39
+ - [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
40
+ - [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
41
+ - [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
42
+
43
+ Load a model
44
+
45
+ ```ruby
46
+ model = BlingFire.load_model("bert_base_tok.bin")
47
+ ```
48
+
49
+ Convert text to ids
50
+
51
+ ```ruby
52
+ model.text_to_ids(text)
53
+ ```
54
+
55
+ ## History
56
+
57
+ View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
58
+
59
+ ## Contributing
60
+
61
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
62
+
63
+ - [Report bugs](https://github.com/ankane/blingfire/issues)
64
+ - Fix bugs and [submit pull requests](https://github.com/ankane/blingfire/pulls)
65
+ - Write, clarify, or fix documentation
66
+ - Suggest or add new features
67
+
68
+ To get started with development:
69
+
70
+ ```sh
71
+ git clone https://github.com/ankane/blingfire.git
72
+ cd blingfire
73
+ bundle install
74
+ bundle exec rake vendor:all
75
+ bundle exec rake test
76
+ ```
@@ -0,0 +1,91 @@
1
+ # stdlib
2
+ require "fiddle/import"
3
+
4
+ # modules
5
+ require "blingfire/model"
6
+ require "blingfire/version"
7
+
8
# Ruby bindings for the BlingFire tokenization library.
#
# The module-level methods are thin wrappers around the native functions
# declared in BlingFire::FFI; they convert between Ruby strings/arrays and the
# raw buffers the native calls read and write.
module BlingFire
  # Raised when a native call reports failure or its output cannot be read safely.
  class Error < StandardError; end

  class << self
    # Shared-library paths handed to BlingFire::FFI, which is loaded lazily.
    attr_accessor :ffi_lib
  end
  lib_name =
    if Gem.win_platform?
      "blingfiretokdll.dll"
    elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
      "libblingfiretokdll.dylib"
    else
      "libblingfiretokdll.so"
    end
  vendor_lib = File.expand_path("../vendor/#{lib_name}", __dir__)
  self.ffi_lib = [vendor_lib]

  # Loading the native library lazily (on first use of FFI) keeps `require`
  # itself from failing and gives a friendlier error message.
  autoload :FFI, "blingfire/ffi"

  class << self
    # @return [Integer] value reported by the native GetBlingFireTokVersion call
    def lib_version
      FFI.GetBlingFireTokVersion
    end

    # @param path [String] path to a model file
    # @return [BlingFire::Model]
    def load_model(path)
      Model.new(path)
    end

    # Splits +text+ into words using the library's default model.
    #
    # @param text [String]
    # @return [Array<String>] empty for empty input
    # @raise [BlingFire::Error] if the native call fails
    def text_to_words(text)
      text_to(text, " ") do |input, out|
        FFI.TextToWords(input, input.bytesize, out, out.size)
      end
    end

    # Splits +text+ into words using a loaded model handle.
    #
    # @param model [Object] handle as returned by the native LoadModel call
    # @param text [String]
    # @return [Array<String>] empty for empty input
    # @raise [BlingFire::Error] if the native call fails
    def text_to_words_with_model(model, text)
      text_to(text, " ") do |input, out|
        FFI.TextToWordsWithModel(input, input.bytesize, out, out.size, model)
      end
    end

    # Splits +text+ into sentences using the library's default model.
    #
    # @param text [String]
    # @return [Array<String>] empty for empty input
    # @raise [BlingFire::Error] if the native call fails
    def text_to_sentences(text)
      text_to(text, "\n") do |input, out|
        FFI.TextToSentences(input, input.bytesize, out, out.size)
      end
    end

    # Splits +text+ into sentences using a loaded model handle.
    #
    # @param model [Object] handle as returned by the native LoadModel call
    # @param text [String]
    # @return [Array<String>] empty for empty input
    # @raise [BlingFire::Error] if the native call fails
    def text_to_sentences_with_model(model, text)
      text_to(text, "\n") do |input, out|
        FFI.TextToSentencesWithModel(input, input.bytesize, out, out.size, model)
      end
    end

    # Converts +text+ to an array of integer ids using a loaded model.
    #
    # @param model [Object] handle as returned by the native LoadModel call
    # @param text [String]
    # @param max_len [Integer, nil] when given, the result always has exactly
    #   this many entries; when nil, the buffer holds one id per character of
    #   +text+ and only the ids reported by the native call are returned
    # @param unk_id [Integer] passed through to the native call
    # @return [Array<Integer>]
    # @raise [BlingFire::Error] if the native call fails
    def text_to_ids(model, text, max_len = nil, unk_id = 0)
      text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
      capacity = max_len || text.size
      return [] if capacity.zero?

      # RUBY_FREE lets the GC release the buffer; without a free function
      # Fiddle never frees malloc'ed memory.
      ids = Fiddle::Pointer.malloc(capacity * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
      # The native side expects the number of id slots, not the byte size of
      # the buffer; passing bytes would let it write past the allocation.
      out_size = FFI.TextToIds(model, text, text.bytesize, ids, capacity, unk_id)
      check_status out_size
      # Never read more ids than the buffer can hold.
      count = max_len || [out_size, capacity].min
      ids[0, count * Fiddle::SIZEOF_INT].unpack("i!*")
    end

    # Releases a model handle through the native FreeModel call.
    def free_model(model)
      FFI.FreeModel(model)
    end

    private

    # Shared implementation of the word/sentence tokenizers.
    #
    # Yields the UTF-8 tagged input and an output buffer to the block, which
    # must perform the native call and return its result; the bytes written are
    # then split on +sep+.
    def text_to(text, sep)
      text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
      return [] if text.empty?

      out = Fiddle::Pointer.malloc(text.bytesize * 3, Fiddle::RUBY_FREE)
      out_size = yield(text, out)
      check_status out_size
      # NOTE(review): assumes the library reports the size it needed when the
      # buffer was too small -- confirm against the BlingFire documentation.
      # Either way, reading past the allocation must be prevented.
      raise Error, "Output buffer too small" if out_size > out.size
      return [] if out_size.zero?

      # The last reported byte is dropped (presumably the NUL terminator).
      encode_utf8(out[0, out_size - 1]).split(sep)
    end

    # @raise [BlingFire::Error] when +ret+ is negative (the library's error value is -1)
    def check_status(ret)
      raise Error, "Bad status" if ret < 0
    end

    # Tags +text+ as UTF-8 in place; the bytes are not transcoded.
    def encode_utf8(text)
      text.force_encoding(Encoding::UTF_8)
    end
  end
end
@@ -0,0 +1,22 @@
1
module BlingFire
  # Fiddle bindings for the native BlingFire tokenizer library.
  #
  # Loading this module opens the shared library named by BlingFire.ffi_lib
  # and declares the native functions the rest of the gem calls.
  module FFI
    extend Fiddle::Importer

    # Work on a copy so the configured list itself is left untouched.
    candidates = Array(BlingFire.ffi_lib).dup
    begin
      library = candidates.shift
      dlload Fiddle.dlopen(library)
    rescue Fiddle::DLError => error
      # Move on to the next candidate; once none remain, surface the last failure.
      retry if candidates.any?
      raise error
    end

    # C prototypes of the native entry points, declared in this order.
    [
      "int GetBlingFireTokVersion()",
      "void* LoadModel(char * pszLdbFileName)",
      "int FreeModel(void* ModelPtr)",
      "int TextToWordsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)",
      "int TextToWords(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)",
      "int TextToIds(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int MaxIdsArrLength, int UnkId)",
      "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)",
      "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
    ].each do |prototype|
      extern prototype
    end
  end
end
@@ -0,0 +1,43 @@
1
module BlingFire
  # A tokenizer model.
  #
  # Built without a path it delegates to the library's default tokenizer;
  # built with a path it loads that model file and delegates to the
  # *_with_model variants using the loaded handle.
  class Model
    # @param path [String, nil] model file to load, or nil for the default model
    # @raise [BlingFire::Error] if the file does not exist or the native loader
    #   returns a null handle
    def initialize(path = nil)
      @handle = nil
      return unless path

      # Fail early instead of handing a missing file to the native loader.
      raise Error, "Model not found" unless File.exist?(path)

      handle = FFI.LoadModel(path)
      # A null handle must never reach the tokenizer calls or the finalizer.
      raise Error, "Failed to load model" if handle.nil? || handle.null?

      @handle = handle
      ObjectSpace.define_finalizer(self, self.class.finalize(@handle))
    end

    # @param text [String]
    # @return [Array<String>] words
    def text_to_words(text)
      if @handle
        BlingFire.text_to_words_with_model(@handle, text)
      else
        BlingFire.text_to_words(text)
      end
    end

    # @param text [String]
    # @return [Array<String>] sentences
    def text_to_sentences(text)
      if @handle
        BlingFire.text_to_sentences_with_model(@handle, text)
      else
        BlingFire.text_to_sentences(text)
      end
    end

    # Only available for models loaded from a file.
    #
    # @param text [String]
    # @param max_len [Integer, nil] forwarded to BlingFire.text_to_ids
    # @param unk_id [Integer] forwarded to BlingFire.text_to_ids
    # @return [Array<Integer>]
    # @raise [RuntimeError] when no model file was loaded
    def text_to_ids(text, max_len = nil, unk_id = 0)
      if @handle
        BlingFire.text_to_ids(@handle, text, max_len, unk_id)
      else
        raise "Not implemented"
      end
    end

    # @return [Object, nil] the native handle, or nil for the default model
    def to_ptr
      @handle
    end

    # Builds the finalizer that frees the native handle. It closes over the
    # handle only, never the Model itself.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.FreeModel(pointer) }
    end
  end
end
@@ -0,0 +1,3 @@
1
module BlingFire
  # Version of this gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation. All rights reserved.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: blingfire
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-02-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '5'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '5'
55
+ description:
56
+ email: andrew@chartkick.com
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - CHANGELOG.md
62
+ - LICENSE.txt
63
+ - README.md
64
+ - lib/blingfire.rb
65
+ - lib/blingfire/ffi.rb
66
+ - lib/blingfire/model.rb
67
+ - lib/blingfire/version.rb
68
+ - vendor/LICENSE
69
+ - vendor/blingfiretokdll.dll
70
+ - vendor/libblingfiretokdll.dylib
71
+ - vendor/libblingfiretokdll.so
72
+ homepage: https://github.com/ankane/blingfire
73
+ licenses:
74
+ - MIT
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '2.4'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.1.2
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: High speed text tokenization for Ruby
95
+ test_files: []