sooth 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/ext/sooth_native/native.c +81 -0
- data/ext/sooth_native/sooth_predictor.c +49 -0
- data/ext/sooth_native/sooth_predictor.h +2 -0
- data/sooth.gemspec +2 -2
- data/spec/predictor_spec.rb +41 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1bd89c2dd37bdeec58eb72ecca31a1075dd28350
|
4
|
+
data.tar.gz: 8bc31931e77993880f3ff5f516a362933c061f8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c79c5b70b6ee7df2e90b2b1fcaabaeae98da17dd8bc0c0338292b5b5ca00bcdd25b9640e1ed6599be982fce7704e1d79803adaded651dd417cb03e9227fc521
|
7
|
+
data.tar.gz: 9cb14586481824dc2d8b6662d0e5d31c27c5b00c1a9932fa1c4ccc4cd5ddf5753b65d403a27c604a518fa11c5af4206be0ca7fcfbf5ad823784ead1f97d1496c
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.5.0
|
data/ext/sooth_native/native.c
CHANGED
@@ -35,6 +35,12 @@ void method_sooth_native_deallocate(void * predictor);
|
|
35
35
|
* def select(bigram, limit)
|
36
36
|
* # (native code)
|
37
37
|
* end
|
38
|
+
* def uncertainty(bigram)
|
39
|
+
* # (native code)
|
40
|
+
* end
|
41
|
+
* def surprise(bigram, symbol)
|
42
|
+
* # (native code)
|
43
|
+
* end
|
38
44
|
* end
|
39
45
|
* end
|
40
46
|
*
|
@@ -106,6 +112,32 @@ VALUE method_sooth_native_count(VALUE self, VALUE bigram);
|
|
106
112
|
*/
|
107
113
|
VALUE method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit);
|
108
114
|
|
115
|
+
/*
|
116
|
+
* Return a number indicating how uncertain the predictor is about which symbol
|
117
|
+
* is likely to be observed after the given bigram. Note that nil will be
|
118
|
+
* returned if the bigram has never been observed.
|
119
|
+
*
|
120
|
+
* @param [Array] bigram A pair of symbols.
|
121
|
+
* @return [Float] The uncertainty, which is calculated to be the Shannon entropy
|
122
|
+
* of the probability distribution over the alphabet of symbols
|
123
|
+
* in the context of the bigram.
|
124
|
+
*/
|
125
|
+
VALUE method_sooth_native_uncertainty(VALUE self, VALUE bigram);
|
126
|
+
|
127
|
+
/*
|
128
|
+
* Return a number indicating the surprise received by the predictor when it
|
129
|
+
* observed the given symbol after the given bigram. Note that nil will be
|
130
|
+
* returned if the symbol has never been observed after the bigram.
|
131
|
+
*
|
132
|
+
* @param [Array] bigram A pair of symbols.
|
133
|
+
* @param [Fixnum] symbol The symbol that has been observed.
|
134
|
+
* @return [Float] The surprise, which is calculated to be the Shannon
|
135
|
+
* self-information (-log2 of the frequency) of the symbol according to the probability
|
136
|
+
* distribution over the alphabet of symbols in the context of
|
137
|
+
* the bigram.
|
138
|
+
*/
|
139
|
+
VALUE method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE symbol);
|
140
|
+
|
109
141
|
//------------------------------------------------------------------------------
|
110
142
|
|
111
143
|
void Init_sooth_native()
|
@@ -123,6 +155,8 @@ void Init_sooth_native()
|
|
123
155
|
rb_define_method(SoothNative, "observe", method_sooth_native_observe, 2);
|
124
156
|
rb_define_method(SoothNative, "count", method_sooth_native_count, 1);
|
125
157
|
rb_define_method(SoothNative, "select", method_sooth_native_select, 2);
|
158
|
+
rb_define_method(SoothNative, "uncertainty", method_sooth_native_uncertainty, 1);
|
159
|
+
rb_define_method(SoothNative, "surprise", method_sooth_native_surprise, 2);
|
126
160
|
}
|
127
161
|
|
128
162
|
//------------------------------------------------------------------------------
|
@@ -258,4 +292,51 @@ method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit)
|
|
258
292
|
return UINT2NUM(symbol);
|
259
293
|
}
|
260
294
|
|
295
|
+
//------------------------------------------------------------------------------
|
296
|
+
|
297
|
+
VALUE
|
298
|
+
method_sooth_native_uncertainty(VALUE self, VALUE bigram)
|
299
|
+
{
|
300
|
+
SoothPredictor * predictor = NULL;
|
301
|
+
Check_Type(bigram, T_ARRAY);
|
302
|
+
if (RARRAY_LEN(bigram) != 2)
|
303
|
+
{
|
304
|
+
rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
|
305
|
+
}
|
306
|
+
Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
|
307
|
+
Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
|
308
|
+
Data_Get_Struct(self, SoothPredictor, predictor);
|
309
|
+
uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
|
310
|
+
double uncertainty = sooth_predictor_uncertainty(predictor, c_bigram);
|
311
|
+
if (uncertainty < 0)
|
312
|
+
{
|
313
|
+
return Qnil;
|
314
|
+
}
|
315
|
+
return DBL2NUM(uncertainty);
|
316
|
+
}
|
317
|
+
|
318
|
+
//------------------------------------------------------------------------------
|
319
|
+
|
320
|
+
VALUE
|
321
|
+
method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE symbol)
|
322
|
+
{
|
323
|
+
SoothPredictor * predictor = NULL;
|
324
|
+
Check_Type(symbol, T_FIXNUM);
|
325
|
+
Check_Type(bigram, T_ARRAY);
|
326
|
+
if (RARRAY_LEN(bigram) != 2)
|
327
|
+
{
|
328
|
+
rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
|
329
|
+
}
|
330
|
+
Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
|
331
|
+
Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
|
332
|
+
Data_Get_Struct(self, SoothPredictor, predictor);
|
333
|
+
uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
|
334
|
+
double surprise = sooth_predictor_surprise(predictor, c_bigram, NUM2UINT(symbol));
|
335
|
+
if (surprise < 0)
|
336
|
+
{
|
337
|
+
return Qnil;
|
338
|
+
}
|
339
|
+
return DBL2NUM(surprise);
|
340
|
+
}
|
341
|
+
|
261
342
|
//==============================================================================
|
data/ext/sooth_native/sooth_predictor.c
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
#include <stdio.h>
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <string.h>
|
6
|
+
#include <math.h>
|
6
7
|
|
7
8
|
#include "sooth_predictor.h"
|
8
9
|
|
@@ -337,4 +338,52 @@ sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t
|
|
337
338
|
return predictor->error_symbol;
|
338
339
|
}
|
339
340
|
|
341
|
+
//------------------------------------------------------------------------------
|
342
|
+
|
343
|
+
double
|
344
|
+
sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2])
|
345
|
+
{
|
346
|
+
SoothContext * context = sooth_predictor_find_context(predictor, bigram);
|
347
|
+
|
348
|
+
if (context == NULL || context->count == 0)
|
349
|
+
{
|
350
|
+
return -1;
|
351
|
+
}
|
352
|
+
|
353
|
+
double uncertainty = 0.0;
|
354
|
+
for (uint32_t i = 0; i < context->statistics_size; ++i)
|
355
|
+
{
|
356
|
+
if (context->statistics[i].count > 0)
|
357
|
+
{
|
358
|
+
double frequency = (double)context->statistics[i].count / (double)context->count;
|
359
|
+
uncertainty -= frequency * log2(frequency);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
return uncertainty;
|
364
|
+
}
|
365
|
+
|
366
|
+
//------------------------------------------------------------------------------
|
367
|
+
|
368
|
+
double
|
369
|
+
sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol)
|
370
|
+
{
|
371
|
+
SoothContext * context = sooth_predictor_find_context(predictor, bigram);
|
372
|
+
|
373
|
+
if (context == NULL || context->count == 0)
|
374
|
+
{
|
375
|
+
return -1;
|
376
|
+
}
|
377
|
+
|
378
|
+
SoothStatistic * statistic = sooth_predictor_find_statistic(context, symbol);
|
379
|
+
|
380
|
+
if (statistic == NULL || statistic->count == 0)
|
381
|
+
{
|
382
|
+
return -1;
|
383
|
+
}
|
384
|
+
|
385
|
+
double frequency = (double)statistic->count / (double)context->count;
|
386
|
+
return -log2(frequency);
|
387
|
+
}
|
388
|
+
|
340
389
|
//==============================================================================
|
data/ext/sooth_native/sooth_predictor.h
CHANGED
@@ -26,6 +26,8 @@ bool sooth_predictor_save(const char * const filename, SoothPredictor * predicto
|
|
26
26
|
uint32_t sooth_predictor_observe(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
|
27
27
|
uint32_t sooth_predictor_count(SoothPredictor * predictor, uint32_t bigram[2]);
|
28
28
|
uint32_t sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t limit);
|
29
|
+
double sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2]);
|
30
|
+
double sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
|
29
31
|
|
30
32
|
//==============================================================================
|
31
33
|
|
data/sooth.gemspec
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: sooth 0.4.0 ruby lib
|
5
|
+
# stub: sooth 0.5.0 ruby lib
|
6
6
|
# stub: ext/sooth_native/extconf.rb
|
7
7
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = "sooth"
|
10
|
-
s.version = "0.4.0"
|
10
|
+
s.version = "0.5.0"
|
11
11
|
|
12
12
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
13
|
s.require_paths = ["lib"]
|
data/spec/predictor_spec.rb
CHANGED
@@ -134,4 +134,45 @@ describe Sooth::Predictor do
|
|
134
134
|
end
|
135
135
|
|
136
136
|
end
|
137
|
+
|
138
|
+
describe "#uncertainty" do
|
139
|
+
|
140
|
+
it "has no uncertainty for a new context" do
|
141
|
+
expect(predictor.uncertainty([1, 2])).to be_nil
|
142
|
+
expect(predictor.count([1, 2])).to eq(0)
|
143
|
+
expect(predictor.uncertainty([1, 2])).to be_nil
|
144
|
+
end
|
145
|
+
|
146
|
+
it "has zero uncertainty for a lone context" do
|
147
|
+
predictor.observe([1, 2], 3)
|
148
|
+
expect(predictor.uncertainty([1, 2])).to eq(0)
|
149
|
+
end
|
150
|
+
|
151
|
+
it "has maximal uncertainty for a uniform distribution" do
|
152
|
+
(1..256).each { |i| predictor.observe([1, 2], i) }
|
153
|
+
expect(predictor.uncertainty([1, 2])).to eq(8)
|
154
|
+
end
|
155
|
+
|
156
|
+
end
|
157
|
+
|
158
|
+
describe "#surprise" do
|
159
|
+
|
160
|
+
it "has no surprise for a new context or symbol" do
|
161
|
+
expect(predictor.surprise([1, 2], 3)).to be_nil
|
162
|
+
expect(predictor.count([1, 2])).to eq(0)
|
163
|
+
expect(predictor.surprise([1, 2], 3)).to be_nil
|
164
|
+
end
|
165
|
+
|
166
|
+
it "has zero surprise for a lone context" do
|
167
|
+
predictor.observe([1, 2], 3)
|
168
|
+
expect(predictor.surprise([1, 2], 3)).to eq(0)
|
169
|
+
end
|
170
|
+
|
171
|
+
it "has uniform surprise for a uniform distribution" do
|
172
|
+
(1..256).each { |i| predictor.observe([1, 2], i) }
|
173
|
+
expect(predictor.surprise([1, 2], 3)).to eq(8)
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
137
178
|
end
|