blingfire 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: a93f90584141a618ff5bb73860068e2ea662c568a07e2f877a5f2be7e265919e
4
+ data.tar.gz: 61b49a4bd7eb304f44c041730bce733102f1936c9340cd9c0c3f1390bd9e853a
5
+ SHA512:
6
+ metadata.gz: fd0242fc13696c620ffd3a7383b9088aaddade1f7280c4f35c54d5165f66e26b783e88fa2d65ced002e8f233af0637e43e9adb68ad7556a16cfdcbb6aff9ba7e
7
+ data.tar.gz: a1206eaa056ed93639c00c03eb16be5fbbec6b077e1677a916b69c6244c26d3c2cce51299dc0394335c54d12de500a9648f30fa6f3785ef749be528904d5ae60
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-02-24)
2
+
3
+ - First release
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2020 Andrew Kane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,76 @@
1
+ # BlingFire
2
+
3
+ [BlingFire](https://github.com/microsoft/BlingFire) - high speed text tokenization - for Ruby
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application’s Gemfile:
8
+
9
+ ```ruby
10
+ gem 'blingfire'
11
+ ```
12
+
13
+ ## Getting Started
14
+
15
+ Create a model
16
+
17
+ ```ruby
18
+ model = BlingFire::Model.new
19
+ ```
20
+
21
+ Tokenize words
22
+
23
+ ```ruby
24
+ model.text_to_words(text)
25
+ ```
26
+
27
+ Tokenize sentences
28
+
29
+ ```ruby
30
+ model.text_to_sentences(text)
31
+ ```
32
+
33
+ ## Pre-trained Models
34
+
35
+ BlingFire comes with a default model that follows the tokenization logic of NLTK with a few changes. You can also download other models:
36
+
37
+ - [BERT Base](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_tok.bin)
38
+ - [BERT Base Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_base_cased_tok.bin)
39
+ - [BERT Chinese](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_chinese.bin)
40
+ - [BERT Multilingual Cased](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/bert_multi_cased.bin)
41
+ - [WBD](https://github.com/microsoft/BlingFire/blob/master/dist-pypi/blingfire/wbd_chuni.bin)
42
+
43
+ Load a model
44
+
45
+ ```ruby
46
+ model = BlingFire.load_model("bert_base_tok.bin")
47
+ ```
48
+
49
+ Convert text to ids
50
+
51
+ ```ruby
52
+ model.text_to_ids(text)
53
+ ```
54
+
55
+ ## History
56
+
57
+ View the [changelog](https://github.com/ankane/blingfire/blob/master/CHANGELOG.md)
58
+
59
+ ## Contributing
60
+
61
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
62
+
63
+ - [Report bugs](https://github.com/ankane/blingfire/issues)
64
+ - Fix bugs and [submit pull requests](https://github.com/ankane/blingfire/pulls)
65
+ - Write, clarify, or fix documentation
66
+ - Suggest or add new features
67
+
68
+ To get started with development:
69
+
70
+ ```sh
71
+ git clone https://github.com/ankane/blingfire.git
72
+ cd blingfire
73
+ bundle install
74
+ bundle exec rake vendor:all
75
+ bundle exec rake test
76
+ ```
@@ -0,0 +1,91 @@
1
+ # stdlib
2
+ require "fiddle/import"
3
+
4
+ # modules
5
+ require "blingfire/model"
6
+ require "blingfire/version"
7
+
8
# Ruby bindings for the BlingFire tokenizer. Native calls go through the
# Fiddle-based BlingFire::FFI module, which is autoloaded on first use.
module BlingFire
  # Raised when a native call reports a failure status.
  class Error < StandardError; end

  class << self
    # Paths of the shared library that BlingFire::FFI attempts to load.
    attr_accessor :ffi_lib
  end

  lib_name =
    if Gem.win_platform?
      "blingfiretokdll.dll"
    elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
      "libblingfiretokdll.dylib"
    else
      "libblingfiretokdll.so"
    end
  vendor_lib = File.expand_path("../vendor/#{lib_name}", __dir__)
  self.ffi_lib = [vendor_lib]

  # friendlier error message
  autoload :FFI, "blingfire/ffi"

  class << self
    # Returns the value reported by the native GetBlingFireTokVersion call.
    def lib_version
      FFI.GetBlingFireTokVersion
    end

    # Builds a Model for the model file at +path+.
    def load_model(path)
      Model.new(path)
    end

    # Splits +text+ into words with the library's default model.
    # Returns an Array of UTF-8 Strings; raises Error on a bad status.
    def text_to_words(text)
      split_output(text, " ") do |input, out|
        FFI.TextToWords(input, input.bytesize, out, out.size)
      end
    end

    # Same as #text_to_words, but uses the loaded model handle +model+.
    def text_to_words_with_model(model, text)
      split_output(text, " ") do |input, out|
        FFI.TextToWordsWithModel(input, input.bytesize, out, out.size, model)
      end
    end

    # Splits +text+ into sentences with the library's default model.
    # Returns an Array of UTF-8 Strings; raises Error on a bad status.
    def text_to_sentences(text)
      split_output(text, "\n") do |input, out|
        FFI.TextToSentences(input, input.bytesize, out, out.size)
      end
    end

    # Same as #text_to_sentences, but uses the loaded model handle +model+.
    def text_to_sentences_with_model(model, text)
      split_output(text, "\n") do |input, out|
        FFI.TextToSentencesWithModel(input, input.bytesize, out, out.size, model)
      end
    end

    # Converts +text+ to an Array of Integer ids using +model+.
    #
    # When +max_len+ is given, exactly +max_len+ ids are returned; otherwise
    # the number of ids reported by the library is returned. +unk_id+ is
    # forwarded to the native call unchanged. Raises Error on a bad status.
    def text_to_ids(model, text, max_len = nil, unk_id = 0)
      input = utf8_input(text)
      capacity = max_len || input.size
      byte_len = [capacity, 1].max * Fiddle::SIZEOF_INT
      ids = Fiddle::Pointer.malloc(byte_len, Fiddle::RUBY_FREE)
      # zero-fill so slots the library does not write never expose stale heap data
      ids[0, byte_len] = "\0" * byte_len
      # the native parameter is an element count, not the buffer's byte size
      out_size = FFI.TextToIds(model, input, input.bytesize, ids, capacity, unk_id)
      check_status out_size
      # never read past the ids that fit in the buffer
      count = max_len || out_size.clamp(0, capacity)
      ids[0, count * Fiddle::SIZEOF_INT].unpack("i!*")
    end

    # Releases a model handle through the native FreeModel call.
    def free_model(model)
      FFI.FreeModel(model)
    end

    private

    def check_status(ret)
      raise Error, "Bad status" if ret == -1
    end

    def encode_utf8(text)
      text.force_encoding(Encoding::UTF_8)
    end

    # Returns +text+ itself when already tagged UTF-8, otherwise a copy
    # re-tagged as UTF-8 (the bytes are not transcoded).
    def utf8_input(text)
      text.encoding == Encoding::UTF_8 ? text : encode_utf8(text.dup)
    end

    # Shared driver for the word/sentence calls: allocates a GC-freed output
    # buffer, yields (input, buffer) to the native call, validates the size
    # it returns, and splits the resulting string on +separator+.
    def split_output(text, separator)
      input = utf8_input(text)
      out = Fiddle::Pointer.malloc([input.bytesize * 3, 1].max, Fiddle::RUBY_FREE)
      out_size = yield(input, out)
      check_status out_size
      # a size beyond the buffer means the output did not fit
      raise Error, "Bad status" if out_size > out.size
      # the last reported byte is dropped below, so sizes 0 and 1 carry no text
      return [] if out_size <= 1

      encode_utf8(out[0, out_size - 1]).split(separator)
    end
  end
end
@@ -0,0 +1,22 @@
1
module BlingFire
  # Fiddle bindings for the native BlingFire tokenizer library.
  module FFI
    extend Fiddle::Importer

    # Try each configured library path in turn; once the list is exhausted
    # the last load error is re-raised.
    candidates = Array(BlingFire.ffi_lib).dup
    begin
      dlload Fiddle.dlopen(candidates.shift)
    rescue Fiddle::DLError
      retry unless candidates.empty?
      raise
    end

    # C prototypes of the native functions exposed as module methods.
    signatures = [
      "int GetBlingFireTokVersion()",
      "void* LoadModel(char * pszLdbFileName)",
      "int FreeModel(void* ModelPtr)",
      "int TextToWordsWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)",
      "int TextToWords(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)",
      "int TextToIds(void* ModelPtr, char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pIdsArr, int MaxIdsArrLength, int UnkId)",
      "int TextToSentences(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount)",
      "int TextToSentencesWithModel(char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, int MaxOutUtf8StrByteCount, void * hModel)"
    ]
    signatures.each { |signature| extern signature }
  end
end
@@ -0,0 +1,43 @@
1
module BlingFire
  # Tokenizer object. Without a path it delegates to the module-level calls
  # that take no model handle; with a path it loads that model file and
  # passes the resulting handle to the *_with_model calls.
  class Model
    # path - optional path of a model file to load.
    #
    # Raises BlingFire::Error when the file does not exist or the native
    # loader returns a NULL handle, instead of keeping an unusable handle.
    def initialize(path = nil)
      return unless path

      raise Error, "Model not found: #{path}" unless File.exist?(path)

      handle = FFI.LoadModel(path)
      raise Error, "Could not load model: #{path}" if handle.null?

      @handle = handle
      # release the native model when this object is garbage collected
      ObjectSpace.define_finalizer(self, self.class.finalize(@handle))
    end

    # Splits text into words, using the loaded model when there is one.
    def text_to_words(text)
      if @handle
        BlingFire.text_to_words_with_model(@handle, text)
      else
        BlingFire.text_to_words(text)
      end
    end

    # Splits text into sentences, using the loaded model when there is one.
    def text_to_sentences(text)
      if @handle
        BlingFire.text_to_sentences_with_model(@handle, text)
      else
        BlingFire.text_to_sentences(text)
      end
    end

    # Converts text to ids; only available when a model file was loaded,
    # otherwise raises RuntimeError ("Not implemented").
    def text_to_ids(text, max_len = nil, unk_id = 0)
      if @handle
        BlingFire.text_to_ids(@handle, text, max_len, unk_id)
      else
        raise "Not implemented"
      end
    end

    # Returns the native model handle, or nil when no model file was loaded.
    def to_ptr
      @handle
    end

    # Builds the finalizer; it closes over the handle only, never the Model
    # instance itself.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.FreeModel(pointer) }
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module BlingFire
2
+ # Version string of this gem.
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation. All rights reserved.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: blingfire
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-02-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '5'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '5'
55
+ description:
56
+ email: andrew@chartkick.com
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - CHANGELOG.md
62
+ - LICENSE.txt
63
+ - README.md
64
+ - lib/blingfire.rb
65
+ - lib/blingfire/ffi.rb
66
+ - lib/blingfire/model.rb
67
+ - lib/blingfire/version.rb
68
+ - vendor/LICENSE
69
+ - vendor/blingfiretokdll.dll
70
+ - vendor/libblingfiretokdll.dylib
71
+ - vendor/libblingfiretokdll.so
72
+ homepage: https://github.com/ankane/blingfire
73
+ licenses:
74
+ - MIT
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '2.4'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.1.2
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: High speed text tokenization for Ruby
95
+ test_files: []