blingfire 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +5 -0
 - data/lib/blingfire/ffi.rb +0 -2
 - data/lib/blingfire/model.rb +1 -1
 - data/lib/blingfire/version.rb +1 -1
 - data/lib/blingfire.rb +19 -19
 - metadata +5 -5
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: a8bcd44e1517ca140f88c58672f9ce09644e4498add5763ec41a5551e8c281ab
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 47c29e51a1f442ecc3a033a73d0c4916e8eebd8be194d46389e3cb64a98a1e8a
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 43d1be1d30fcadf809aab8728d8a7d6d7b57128dc4f7e8784072e469e43fd01ff3a463ecc915563d3bc3c84c7265bfad7b854c26759bc6eee0b47196c79c1456
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 6fc78f981153589681ba6016a9d0e2aaa264d68a551b7656db03e353aaa2bd9441ba9653bece7d6f886063ebca25d8006804d54328c207a6b01719a63f664e53
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/lib/blingfire/ffi.rb
    CHANGED
    
    
    
        data/lib/blingfire/model.rb
    CHANGED
    
    | 
         @@ -4,7 +4,7 @@ module BlingFire 
     | 
|
| 
       4 
4 
     | 
    
         
             
                  @handle = nil
         
     | 
| 
       5 
5 
     | 
    
         
             
                  if path
         
     | 
| 
       6 
6 
     | 
    
         
             
                    raise Error, "Model not found" unless File.exist?(path)
         
     | 
| 
       7 
     | 
    
         
            -
                    @handle = FFI.LoadModel(path)
         
     | 
| 
      
 7 
     | 
    
         
            +
                    @handle = FFI.LoadModel(+path)
         
     | 
| 
       8 
8 
     | 
    
         
             
                    @handle.free = FFI["FreeModel"]
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
10 
     | 
    
         
             
                    BlingFire.change_settings_dummy_prefix(@handle, prefix) unless prefix.nil?
         
     | 
    
        data/lib/blingfire/version.rb
    CHANGED
    
    
    
        data/lib/blingfire.rb
    CHANGED
    
    | 
         @@ -92,20 +92,20 @@ module BlingFire 
     | 
|
| 
       92 
92 
     | 
    
         | 
| 
       93 
93 
     | 
    
         
             
                def text_to_ids(model, text, max_len = nil, unk_id = 0)
         
     | 
| 
       94 
94 
     | 
    
         
             
                  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
         
     | 
| 
       95 
     | 
    
         
            -
                  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
         
     | 
| 
       96 
     | 
    
         
            -
                  out_size = FFI.TextToIds(model, text, text.bytesize, ids, ids.size, unk_id)
         
     | 
| 
      
 95 
     | 
    
         
            +
                  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
         
     | 
| 
      
 96 
     | 
    
         
            +
                  out_size = FFI.TextToIds(model, +text, text.bytesize, ids, ids.size, unk_id)
         
     | 
| 
       97 
97 
     | 
    
         
             
                  check_status out_size, ids
         
     | 
| 
       98 
98 
     | 
    
         
             
                  ids[0, (max_len || out_size) * Fiddle::SIZEOF_INT].unpack("i!*")
         
     | 
| 
       99 
99 
     | 
    
         
             
                end
         
     | 
| 
       100 
100 
     | 
    
         | 
| 
       101 
101 
     | 
    
         
             
                def text_to_ids_with_offsets(model, text, max_len = nil, unk_id = 0)
         
     | 
| 
       102 
102 
     | 
    
         
             
                  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
         
     | 
| 
       103 
     | 
    
         
            -
                  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT)
         
     | 
| 
      
 103 
     | 
    
         
            +
                  ids = Fiddle::Pointer.malloc((max_len || text.size) * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE)
         
     | 
| 
       104 
104 
     | 
    
         | 
| 
       105 
     | 
    
         
            -
                  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
         
     | 
| 
       106 
     | 
    
         
            -
                  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size)
         
     | 
| 
      
 105 
     | 
    
         
            +
                  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size, Fiddle::RUBY_FREE)
         
     | 
| 
      
 106 
     | 
    
         
            +
                  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * ids.size, Fiddle::RUBY_FREE)
         
     | 
| 
       107 
107 
     | 
    
         | 
| 
       108 
     | 
    
         
            -
                  out_size = FFI.TextToIdsWithOffsets(model, text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)
         
     | 
| 
      
 108 
     | 
    
         
            +
                  out_size = FFI.TextToIdsWithOffsets(model, +text, text.bytesize, ids, start_offsets, end_offsets, ids.size, unk_id)
         
     | 
| 
       109 
109 
     | 
    
         | 
| 
       110 
110 
     | 
    
         
             
                  check_status out_size, ids
         
     | 
| 
       111 
111 
     | 
    
         | 
| 
         @@ -116,8 +116,8 @@ module BlingFire 
     | 
|
| 
       116 
116 
     | 
    
         
             
                def ids_to_text(model, ids, skip_special_tokens: true, output_buffer_size: nil)
         
     | 
| 
       117 
117 
     | 
    
         
             
                  output_buffer_size ||= ids.size * 32
         
     | 
| 
       118 
118 
     | 
    
         
             
                  c_ids = Fiddle::Pointer[ids.pack("i*")]
         
     | 
| 
       119 
     | 
    
         
            -
                  out = Fiddle::Pointer.malloc(output_buffer_size)
         
     | 
| 
       120 
     | 
    
         
            -
                  out_size = FFI.IdsToText(model, c_ids, ids.size, out, output_buffer_size, skip_special_tokens ? 1 : 0)
     | 
| 
      
 119 
     | 
    
         
            +
                  out = Fiddle::Pointer.malloc(output_buffer_size, Fiddle::RUBY_FREE)
         
     | 
| 
      
 120 
     | 
    
         
            +
                  out_size = FFI.IdsToText(model, c_ids, ids.size, out, output_buffer_size, !!skip_special_tokens)
         
     | 
| 
       121 
121 
     | 
    
         
             
                  check_status out_size, out
         
     | 
| 
       122 
122 
     | 
    
         
             
                  out_size <= 0 ? "" : encode_utf8(out.to_str(out_size - 1))
         
     | 
| 
       123 
123 
     | 
    
         
             
                end
         
     | 
| 
         @@ -129,15 +129,15 @@ module BlingFire 
     | 
|
| 
       129 
129 
     | 
    
         
             
                def normalize_spaces(text)
         
     | 
| 
       130 
130 
     | 
    
         
             
                  u_space = 0x20
         
     | 
| 
       131 
131 
     | 
    
         
             
                  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
         
     | 
| 
       132 
     | 
    
         
            -
                  out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max)
         
     | 
| 
       133 
     | 
    
         
            -
                  out_size = FFI.NormalizeSpaces(text, text.bytesize, out, out.size, u_space)
         
     | 
| 
      
 132 
     | 
    
         
            +
                  out = Fiddle::Pointer.malloc([text.bytesize * 1.5, 20].max, Fiddle::RUBY_FREE)
         
     | 
| 
      
 133 
     | 
    
         
            +
                  out_size = FFI.NormalizeSpaces(+text, text.bytesize, out, out.size, u_space)
         
     | 
| 
       134 
134 
     | 
    
         
             
                  check_status out_size, out
         
     | 
| 
       135 
135 
     | 
    
         
             
                  encode_utf8(out.to_str(out_size))
         
     | 
| 
       136 
136 
     | 
    
         
             
                end
         
     | 
| 
       137 
137 
     | 
    
         | 
| 
       138 
138 
     | 
    
         
             
                def change_settings_dummy_prefix(model, value)
         
     | 
| 
       139 
139 
     | 
    
         
             
                  # use opposite of value
         
     | 
| 
       140 
     | 
    
         
            -
                  ret = FFI.SetNoDummyPrefix(model, value ? 0 : 1)
     | 
| 
      
 140 
     | 
    
         
            +
                  ret = FFI.SetNoDummyPrefix(model, !value)
         
     | 
| 
       141 
141 
     | 
    
         
             
                  raise Error, "Bad status: #{ret}" if ret != 1
         
     | 
| 
       142 
142 
     | 
    
         
             
                end
         
     | 
| 
       143 
143 
     | 
    
         | 
| 
         @@ -150,8 +150,8 @@ module BlingFire 
     | 
|
| 
       150 
150 
     | 
    
         
             
                def text_to(text, sep)
         
     | 
| 
       151 
151 
     | 
    
         
             
                  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
         
     | 
| 
       152 
152 
     | 
    
         
             
                  # TODO allocate less, and try again if needed
         
     | 
| 
       153 
     | 
    
         
            -
                  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
         
     | 
| 
       154 
     | 
    
         
            -
                  out_size = yield(text, out)
         
     | 
| 
      
 153 
     | 
    
         
            +
                  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max, Fiddle::RUBY_FREE)
         
     | 
| 
      
 154 
     | 
    
         
            +
                  out_size = yield(+text, out)
         
     | 
| 
       155 
155 
     | 
    
         
             
                  check_status out_size, out
         
     | 
| 
       156 
156 
     | 
    
         
             
                  encode_utf8(out.to_str(out_size - 1)).split(sep)
         
     | 
| 
       157 
157 
     | 
    
         
             
                end
         
     | 
| 
         @@ -159,12 +159,12 @@ module BlingFire 
     | 
|
| 
       159 
159 
     | 
    
         
             
                def text_to_with_offsets(text, sep)
         
     | 
| 
       160 
160 
     | 
    
         
             
                  text = encode_utf8(text.dup) unless text.encoding == Encoding::UTF_8
         
     | 
| 
       161 
161 
     | 
    
         
             
                  # TODO allocate less, and try again if needed
         
     | 
| 
       162 
     | 
    
         
            -
                  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max)
         
     | 
| 
      
 162 
     | 
    
         
            +
                  out = Fiddle::Pointer.malloc([text.bytesize * 3, 20].max, Fiddle::RUBY_FREE)
         
     | 
| 
       163 
163 
     | 
    
         | 
| 
       164 
     | 
    
         
            -
                  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
         
     | 
| 
       165 
     | 
    
         
            -
                  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size)
         
     | 
| 
      
 164 
     | 
    
         
            +
                  start_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size, Fiddle::RUBY_FREE)
         
     | 
| 
      
 165 
     | 
    
         
            +
                  end_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_INT * out.size, Fiddle::RUBY_FREE)
         
     | 
| 
       166 
166 
     | 
    
         | 
| 
       167 
     | 
    
         
            -
                  out_size = yield(text, out, start_offsets, end_offsets)
         
     | 
| 
      
 167 
     | 
    
         
            +
                  out_size = yield(+text, out, start_offsets, end_offsets)
         
     | 
| 
       168 
168 
     | 
    
         | 
| 
       169 
169 
     | 
    
         
             
                  check_status out_size, out
         
     | 
| 
       170 
170 
     | 
    
         | 
| 
         @@ -177,8 +177,8 @@ module BlingFire 
     | 
|
| 
       177 
177 
     | 
    
         
             
                end
         
     | 
| 
       178 
178 
     | 
    
         | 
| 
       179 
179 
     | 
    
         
             
                def unpack_offsets(start_offsets, end_offsets, result, text)
         
     | 
| 
       180 
     | 
    
         
            -
                  start_bytes = start_offsets. 
     | 
| 
       181 
     | 
    
         
            -
                  end_bytes = end_offsets. 
     | 
| 
      
 180 
     | 
    
         
            +
                  start_bytes = start_offsets.to_str(Fiddle::SIZEOF_INT * result.size).unpack("i*")
         
     | 
| 
      
 181 
     | 
    
         
            +
                  end_bytes = end_offsets.to_str(Fiddle::SIZEOF_INT * result.size).unpack("i*")
         
     | 
| 
       182 
182 
     | 
    
         
             
                  starts = []
         
     | 
| 
       183 
183 
     | 
    
         
             
                  ends = []
         
     | 
| 
       184 
184 
     | 
    
         | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: blingfire
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.3.0
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.3.1
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Andrew Kane
         
     | 
| 
       8 
8 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       9 
9 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       10 
     | 
    
         
            -
            date:  
     | 
| 
      
 10 
     | 
    
         
            +
            date: 1980-01-02 00:00:00.000000000 Z
         
     | 
| 
       11 
11 
     | 
    
         
             
            dependencies:
         
     | 
| 
       12 
12 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       13 
13 
     | 
    
         
             
              name: fiddle
         
     | 
| 
         @@ -15,14 +15,14 @@ dependencies: 
     | 
|
| 
       15 
15 
     | 
    
         
             
                requirements:
         
     | 
| 
       16 
16 
     | 
    
         
             
                - - ">="
         
     | 
| 
       17 
17 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       18 
     | 
    
         
            -
                    version:  
     | 
| 
      
 18 
     | 
    
         
            +
                    version: 1.1.7
         
     | 
| 
       19 
19 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       20 
20 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       21 
21 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       22 
22 
     | 
    
         
             
                requirements:
         
     | 
| 
       23 
23 
     | 
    
         
             
                - - ">="
         
     | 
| 
       24 
24 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       25 
     | 
    
         
            -
                    version:  
     | 
| 
      
 25 
     | 
    
         
            +
                    version: 1.1.7
         
     | 
| 
       26 
26 
     | 
    
         
             
            email: andrew@ankane.org
         
     | 
| 
       27 
27 
     | 
    
         
             
            executables: []
         
     | 
| 
       28 
28 
     | 
    
         
             
            extensions: []
         
     | 
| 
         @@ -59,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       59 
59 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       60 
60 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       61 
61 
     | 
    
         
             
            requirements: []
         
     | 
| 
       62 
     | 
    
         
            -
            rubygems_version: 3.6. 
     | 
| 
      
 62 
     | 
    
         
            +
            rubygems_version: 3.6.7
         
     | 
| 
       63 
63 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       64 
64 
     | 
    
         
             
            summary: High speed text tokenization for Ruby
         
     | 
| 
       65 
65 
     | 
    
         
             
            test_files: []
         
     |