sooth 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/ext/sooth_native/native.c +81 -0
- data/ext/sooth_native/sooth_predictor.c +49 -0
- data/ext/sooth_native/sooth_predictor.h +2 -0
- data/sooth.gemspec +2 -2
- data/spec/predictor_spec.rb +41 -0
- metadata +1 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 1bd89c2dd37bdeec58eb72ecca31a1075dd28350
         | 
| 4 | 
            +
              data.tar.gz: 8bc31931e77993880f3ff5f516a362933c061f8d
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 3c79c5b70b6ee7df2e90b2b1fcaabaeae98da17dd8bc0c0338292b5b5ca00bcdd25b9640e1ed6599be982fce7704e1d79803adaded651dd417cb03e9227fc521
         | 
| 7 | 
            +
              data.tar.gz: 9cb14586481824dc2d8b6662d0e5d31c27c5b00c1a9932fa1c4ccc4cd5ddf5753b65d403a27c604a518fa11c5af4206be0ca7fcfbf5ad823784ead1f97d1496c
         | 
    
        data/VERSION
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            0.4.0 | 
| 1 | 
            +
            0.5.0
         | 
    
        data/ext/sooth_native/native.c
    CHANGED
    
    | @@ -35,6 +35,12 @@ void method_sooth_native_deallocate(void * predictor); | |
| 35 35 | 
             
             *       def select(bigram, limit)
         | 
| 36 36 | 
             
             *         # (native code)
         | 
| 37 37 | 
             
             *       end
         | 
| 38 | 
            +
             *       def uncertainty(bigram)
         | 
| 39 | 
            +
             *         # (native code)
         | 
| 40 | 
            +
             *       end
         | 
| 41 | 
            +
             *       def surprise(bigram, symbol)
         | 
| 42 | 
            +
             *         # (native code)
         | 
| 43 | 
            +
             *       end
         | 
| 38 44 | 
             
             *     end
         | 
| 39 45 | 
             
             *   end
         | 
| 40 46 | 
             
             *
         | 
| @@ -106,6 +112,32 @@ VALUE method_sooth_native_count(VALUE self, VALUE bigram); | |
| 106 112 | 
             
             */
         | 
| 107 113 | 
             
            VALUE method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit);
         | 
| 108 114 |  | 
| 115 | 
            +
            /*
         | 
| 116 | 
            +
             * Return a number indicating how uncertain the predictor is about which symbol
         | 
| 117 | 
            +
             * is likely to be observed after the given bigram. Note that nil will be
         | 
| 118 | 
            +
             * returned if the bigram has never been observed.
         | 
| 119 | 
            +
             *
         | 
| 120 | 
            +
             * @param [Array] bigram A pair of symbols.
         | 
| 121 | 
            +
             * @return [Float] The uncertainty, which is calculated to be the Shannon entropy
         | 
| 122 | 
            +
             *                 of the probability distribution over the alphabet of symbols
         | 
| 123 | 
            +
             *                 in the context of the bigram.
         | 
| 124 | 
            +
             */
         | 
| 125 | 
            +
            VALUE method_sooth_native_uncertainty(VALUE self, VALUE bigram);
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            /*
         | 
| 128 | 
            +
             * Return a number indicating the surprise received by the predictor when it
         | 
| 129 | 
            +
             * observed the given symbol after the given bigram. Note that nil will be
         | 
| 130 | 
            +
             * returned if the symbol has never been observed after the bigram.
         | 
| 131 | 
            +
             *
         | 
| 132 | 
            +
             * @param [Array] bigram A pair of symbols.
         | 
| 133 | 
            +
             * @param [Fixnum] symbol The symbol that has been observed.
         | 
| 134 | 
            +
             * @return [Float] The surprise, which is calculated to be the Shannon
         | 
| 135 | 
            +
             *                 self-information of the symbol according to the probability
         | 
| 136 | 
            +
             *                 distribution over the alphabet of symbols in the context of
         | 
| 137 | 
            +
             *                 the bigram.
         | 
| 138 | 
            +
             */
         | 
| 139 | 
            +
            VALUE method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE limit);
         | 
| 140 | 
            +
             | 
| 109 141 | 
             
            //------------------------------------------------------------------------------
         | 
| 110 142 |  | 
| 111 143 | 
             
            void Init_sooth_native()
         | 
| @@ -123,6 +155,8 @@ void Init_sooth_native() | |
| 123 155 | 
             
              rb_define_method(SoothNative, "observe", method_sooth_native_observe, 2);
         | 
| 124 156 | 
             
              rb_define_method(SoothNative, "count", method_sooth_native_count, 1);
         | 
| 125 157 | 
             
              rb_define_method(SoothNative, "select", method_sooth_native_select, 2);
         | 
| 158 | 
            +
              rb_define_method(SoothNative, "uncertainty", method_sooth_native_uncertainty, 1);
         | 
| 159 | 
            +
              rb_define_method(SoothNative, "surprise", method_sooth_native_surprise, 2);
         | 
| 126 160 | 
             
            }
         | 
| 127 161 |  | 
| 128 162 | 
             
            //------------------------------------------------------------------------------
         | 
| @@ -258,4 +292,51 @@ method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit) | |
| 258 292 | 
             
              return UINT2NUM(symbol);
         | 
| 259 293 | 
             
            }
         | 
| 260 294 |  | 
| 295 | 
            +
            //------------------------------------------------------------------------------
         | 
| 296 | 
            +
             | 
| 297 | 
            +
            VALUE
         | 
| 298 | 
            +
            method_sooth_native_uncertainty(VALUE self, VALUE bigram)
         | 
| 299 | 
            +
            {
         | 
| 300 | 
            +
              SoothPredictor * predictor = NULL;
         | 
| 301 | 
            +
              Check_Type(bigram, T_ARRAY);
         | 
| 302 | 
            +
              if (RARRAY_LEN(bigram) != 2)
         | 
| 303 | 
            +
              {
         | 
| 304 | 
            +
                rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
         | 
| 305 | 
            +
              }
         | 
| 306 | 
            +
              Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
         | 
| 307 | 
            +
              Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
         | 
| 308 | 
            +
              Data_Get_Struct(self, SoothPredictor, predictor);
         | 
| 309 | 
            +
              uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
         | 
| 310 | 
            +
              double uncertainty = sooth_predictor_uncertainty(predictor, c_bigram);
         | 
| 311 | 
            +
              if (uncertainty < 0)
         | 
| 312 | 
            +
              {
         | 
| 313 | 
            +
                return Qnil;
         | 
| 314 | 
            +
              }
         | 
| 315 | 
            +
              return DBL2NUM(uncertainty);
         | 
| 316 | 
            +
            }
         | 
| 317 | 
            +
             | 
| 318 | 
            +
            //------------------------------------------------------------------------------
         | 
| 319 | 
            +
             | 
| 320 | 
            +
            VALUE
         | 
| 321 | 
            +
            method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE symbol)
         | 
| 322 | 
            +
            {
         | 
| 323 | 
            +
              SoothPredictor * predictor = NULL;
         | 
| 324 | 
            +
              Check_Type(symbol, T_FIXNUM);
         | 
| 325 | 
            +
              Check_Type(bigram, T_ARRAY);
         | 
| 326 | 
            +
              if (RARRAY_LEN(bigram) != 2)
         | 
| 327 | 
            +
              {
         | 
| 328 | 
            +
                rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
         | 
| 329 | 
            +
              }
         | 
| 330 | 
            +
              Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
         | 
| 331 | 
            +
              Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
         | 
| 332 | 
            +
              Data_Get_Struct(self, SoothPredictor, predictor);
         | 
| 333 | 
            +
              uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
         | 
| 334 | 
            +
              double surprise = sooth_predictor_surprise(predictor, c_bigram, NUM2UINT(symbol));
         | 
| 335 | 
            +
              if (surprise < 0)
         | 
| 336 | 
            +
              {
         | 
| 337 | 
            +
                return Qnil;
         | 
| 338 | 
            +
              }
         | 
| 339 | 
            +
              return DBL2NUM(surprise);
         | 
| 340 | 
            +
            }
         | 
| 341 | 
            +
             | 
| 261 342 | 
             
            //==============================================================================
         | 
| @@ -3,6 +3,7 @@ | |
| 3 3 | 
             
            #include <stdio.h>
         | 
| 4 4 | 
             
            #include <stdlib.h>
         | 
| 5 5 | 
             
            #include <string.h>
         | 
| 6 | 
            +
            #include <math.h>
         | 
| 6 7 |  | 
| 7 8 | 
             
            #include "sooth_predictor.h"
         | 
| 8 9 |  | 
| @@ -337,4 +338,52 @@ sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t | |
| 337 338 | 
             
              return predictor->error_symbol;
         | 
| 338 339 | 
             
            }
         | 
| 339 340 |  | 
| 341 | 
            +
            //------------------------------------------------------------------------------
         | 
| 342 | 
            +
             | 
| 343 | 
            +
            double
         | 
| 344 | 
            +
            sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2])
         | 
| 345 | 
            +
            {
         | 
| 346 | 
            +
              SoothContext * context = sooth_predictor_find_context(predictor, bigram);
         | 
| 347 | 
            +
             | 
| 348 | 
            +
              if (context == NULL || context->count == 0)
         | 
| 349 | 
            +
              {
         | 
| 350 | 
            +
                return -1;
         | 
| 351 | 
            +
              }
         | 
| 352 | 
            +
             | 
| 353 | 
            +
              double uncertainty = 0.0;
         | 
| 354 | 
            +
              for (uint32_t i = 0; i < context->statistics_size; ++i)
         | 
| 355 | 
            +
              {
         | 
| 356 | 
            +
                if (context->statistics[i].count > 0)
         | 
| 357 | 
            +
                {
         | 
| 358 | 
            +
                  double frequency = (double)context->statistics[i].count / (double)context->count;
         | 
| 359 | 
            +
                  uncertainty -= frequency * log2(frequency);
         | 
| 360 | 
            +
                }
         | 
| 361 | 
            +
              }
         | 
| 362 | 
            +
             | 
| 363 | 
            +
              return uncertainty;
         | 
| 364 | 
            +
            }
         | 
| 365 | 
            +
             | 
| 366 | 
            +
            //------------------------------------------------------------------------------
         | 
| 367 | 
            +
             | 
| 368 | 
            +
            double
         | 
| 369 | 
            +
            sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol)
         | 
| 370 | 
            +
            {
         | 
| 371 | 
            +
              SoothContext * context = sooth_predictor_find_context(predictor, bigram);
         | 
| 372 | 
            +
             | 
| 373 | 
            +
              if (context == NULL || context->count == 0)
         | 
| 374 | 
            +
              {
         | 
| 375 | 
            +
                return -1;
         | 
| 376 | 
            +
              }
         | 
| 377 | 
            +
             | 
| 378 | 
            +
              SoothStatistic * statistic = sooth_predictor_find_statistic(context, symbol);
         | 
| 379 | 
            +
             | 
| 380 | 
            +
              if (statistic == NULL || statistic->count == 0)
         | 
| 381 | 
            +
              {
         | 
| 382 | 
            +
                return -1;
         | 
| 383 | 
            +
              }
         | 
| 384 | 
            +
             | 
| 385 | 
            +
              double frequency = (double)statistic->count / (double)context->count;
         | 
| 386 | 
            +
              return -log2(frequency);
         | 
| 387 | 
            +
            }
         | 
| 388 | 
            +
             | 
| 340 389 | 
             
            //==============================================================================
         | 
| @@ -26,6 +26,8 @@ bool sooth_predictor_save(const char * const filename, SoothPredictor * predicto | |
| 26 26 | 
             
            uint32_t sooth_predictor_observe(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
         | 
| 27 27 | 
             
            uint32_t sooth_predictor_count(SoothPredictor * predictor, uint32_t bigram[2]);
         | 
| 28 28 | 
             
            uint32_t sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t limit);
         | 
| 29 | 
            +
            double sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2]);
         | 
| 30 | 
            +
            double sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
         | 
| 29 31 |  | 
| 30 32 | 
             
            //==============================================================================
         | 
| 31 33 |  | 
    
        data/sooth.gemspec
    CHANGED
    
    | @@ -2,12 +2,12 @@ | |
| 2 2 | 
             
            # DO NOT EDIT THIS FILE DIRECTLY
         | 
| 3 3 | 
             
            # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
         | 
| 4 4 | 
             
            # -*- encoding: utf-8 -*-
         | 
| 5 | 
            -
            # stub: sooth 0.4.0 ruby lib | 
| 5 | 
            +
            # stub: sooth 0.5.0 ruby lib
         | 
| 6 6 | 
             
            # stub: ext/sooth_native/extconf.rb
         | 
| 7 7 |  | 
| 8 8 | 
             
            Gem::Specification.new do |s|
         | 
| 9 9 | 
             
              s.name = "sooth"
         | 
| 10 | 
            -
              s.version = "0.4.0" | 
| 10 | 
            +
              s.version = "0.5.0"
         | 
| 11 11 |  | 
| 12 12 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 13 13 | 
             
              s.require_paths = ["lib"]
         | 
    
        data/spec/predictor_spec.rb
    CHANGED
    
    | @@ -134,4 +134,45 @@ describe Sooth::Predictor do | |
| 134 134 | 
             
                end
         | 
| 135 135 |  | 
| 136 136 | 
             
              end
         | 
| 137 | 
            +
             | 
| 138 | 
            +
              describe "#uncertainty" do
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                it "has no uncertainty for a new context" do
         | 
| 141 | 
            +
                  expect(predictor.uncertainty([1, 2])).to be_nil 
         | 
| 142 | 
            +
                  expect(predictor.count([1, 2])).to eq(0) 
         | 
| 143 | 
            +
                  expect(predictor.uncertainty([1, 2])).to be_nil 
         | 
| 144 | 
            +
                end
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                it "has zero uncertainty for a lone context" do
         | 
| 147 | 
            +
                  predictor.observe([1, 2], 3)
         | 
| 148 | 
            +
                  expect(predictor.uncertainty([1, 2])).to eq(0) 
         | 
| 149 | 
            +
                end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                it "has maximal uncertainty for a uniform distribution" do
         | 
| 152 | 
            +
                  (1..256).each { |i| predictor.observe([1, 2], i) }
         | 
| 153 | 
            +
                  expect(predictor.uncertainty([1, 2])).to eq(8) 
         | 
| 154 | 
            +
                end
         | 
| 155 | 
            +
             | 
| 156 | 
            +
              end
         | 
| 157 | 
            +
             | 
| 158 | 
            +
              describe "#surprise" do
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                it "has no surprise for a new context or symbol" do
         | 
| 161 | 
            +
                  expect(predictor.surprise([1, 2], 3)).to be_nil 
         | 
| 162 | 
            +
                  expect(predictor.count([1, 2])).to eq(0) 
         | 
| 163 | 
            +
                  expect(predictor.surprise([1, 2], 3)).to be_nil 
         | 
| 164 | 
            +
                end
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                it "has zero surprise for a lone context" do
         | 
| 167 | 
            +
                  predictor.observe([1, 2], 3)
         | 
| 168 | 
            +
                  expect(predictor.surprise([1, 2], 3)).to eq(0) 
         | 
| 169 | 
            +
                end
         | 
| 170 | 
            +
             | 
| 171 | 
            +
                it "has uniform surprise for a uniform distribution" do
         | 
| 172 | 
            +
                  (1..256).each { |i| predictor.observe([1, 2], i) }
         | 
| 173 | 
            +
                  expect(predictor.surprise([1, 2], 3)).to eq(8) 
         | 
| 174 | 
            +
                end
         | 
| 175 | 
            +
             | 
| 176 | 
            +
              end
         | 
| 177 | 
            +
             | 
| 137 178 | 
             
            end
         |