sooth 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/ext/sooth_native/native.c +81 -0
- data/ext/sooth_native/sooth_predictor.c +49 -0
- data/ext/sooth_native/sooth_predictor.h +2 -0
- data/sooth.gemspec +2 -2
- data/spec/predictor_spec.rb +41 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1bd89c2dd37bdeec58eb72ecca31a1075dd28350
|
4
|
+
data.tar.gz: 8bc31931e77993880f3ff5f516a362933c061f8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c79c5b70b6ee7df2e90b2b1fcaabaeae98da17dd8bc0c0338292b5b5ca00bcdd25b9640e1ed6599be982fce7704e1d79803adaded651dd417cb03e9227fc521
|
7
|
+
data.tar.gz: 9cb14586481824dc2d8b6662d0e5d31c27c5b00c1a9932fa1c4ccc4cd5ddf5753b65d403a27c604a518fa11c5af4206be0ca7fcfbf5ad823784ead1f97d1496c
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.5.0
|
data/ext/sooth_native/native.c
CHANGED
@@ -35,6 +35,12 @@ void method_sooth_native_deallocate(void * predictor);
|
|
35
35
|
* def select(bigram, limit)
|
36
36
|
* # (native code)
|
37
37
|
* end
|
38
|
+
* def uncertainty(bigram)
|
39
|
+
* # (native code)
|
40
|
+
* end
|
41
|
+
* def surprise(bigram, symbol)
|
42
|
+
* # (native code)
|
43
|
+
* end
|
38
44
|
* end
|
39
45
|
* end
|
40
46
|
*
|
@@ -106,6 +112,32 @@ VALUE method_sooth_native_count(VALUE self, VALUE bigram);
|
|
106
112
|
*/
|
107
113
|
VALUE method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit);
|
108
114
|
|
115
|
+
/*
|
116
|
+
* Return a number indicating how uncertain the predictor is about which symbol
|
117
|
+
* is likely to be observed after the given bigram. Note that nil will be
|
118
|
+
* returned if the bigram has never been observed.
|
119
|
+
*
|
120
|
+
* @param [Array] bigram A pair of symbols.
|
121
|
+
* @return [Float] The uncertainty, which is calculated to be the Shannon entropy
|
122
|
+
* of the probability distribution over the alphabet of symbols
|
123
|
+
* in the context of the bigram.
|
124
|
+
*/
|
125
|
+
VALUE method_sooth_native_uncertainty(VALUE self, VALUE bigram);
|
126
|
+
|
127
|
+
/*
|
128
|
+
* Return a number indicating the surprise received by the predictor when it
|
129
|
+
* observed the given symbol after the given bigram. Note that nil will be
|
130
|
+
* returned if the symbol has never been observed after the bigram.
|
131
|
+
*
|
132
|
+
* @param [Array] bigram A pair of symbols.
|
133
|
+
* @param [Fixnum] symbol The symbol that has been observed.
|
134
|
+
* @return [Float] The surprise, which is calculated to be the Shannon pointwise
|
135
|
+
* mutual information of the symbol according to the probability
|
136
|
+
* distribution over the alphabet of symbols in the context of
|
137
|
+
* the bigram.
|
138
|
+
*/
|
139
|
+
VALUE method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE symbol);
|
140
|
+
|
109
141
|
//------------------------------------------------------------------------------
|
110
142
|
|
111
143
|
void Init_sooth_native()
|
@@ -123,6 +155,8 @@ void Init_sooth_native()
|
|
123
155
|
rb_define_method(SoothNative, "observe", method_sooth_native_observe, 2);
|
124
156
|
rb_define_method(SoothNative, "count", method_sooth_native_count, 1);
|
125
157
|
rb_define_method(SoothNative, "select", method_sooth_native_select, 2);
|
158
|
+
rb_define_method(SoothNative, "uncertainty", method_sooth_native_uncertainty, 1);
|
159
|
+
rb_define_method(SoothNative, "surprise", method_sooth_native_surprise, 2);
|
126
160
|
}
|
127
161
|
|
128
162
|
//------------------------------------------------------------------------------
|
@@ -258,4 +292,51 @@ method_sooth_native_select(VALUE self, VALUE bigram, VALUE limit)
|
|
258
292
|
return UINT2NUM(symbol);
|
259
293
|
}
|
260
294
|
|
295
|
+
//------------------------------------------------------------------------------
|
296
|
+
|
297
|
+
VALUE
|
298
|
+
method_sooth_native_uncertainty(VALUE self, VALUE bigram)
|
299
|
+
{
|
300
|
+
SoothPredictor * predictor = NULL;
|
301
|
+
Check_Type(bigram, T_ARRAY);
|
302
|
+
if (RARRAY_LEN(bigram) != 2)
|
303
|
+
{
|
304
|
+
rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
|
305
|
+
}
|
306
|
+
Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
|
307
|
+
Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
|
308
|
+
Data_Get_Struct(self, SoothPredictor, predictor);
|
309
|
+
uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
|
310
|
+
double uncertainty = sooth_predictor_uncertainty(predictor, c_bigram);
|
311
|
+
if (uncertainty < 0)
|
312
|
+
{
|
313
|
+
return Qnil;
|
314
|
+
}
|
315
|
+
return DBL2NUM(uncertainty);
|
316
|
+
}
|
317
|
+
|
318
|
+
//------------------------------------------------------------------------------
|
319
|
+
|
320
|
+
VALUE
|
321
|
+
method_sooth_native_surprise(VALUE self, VALUE bigram, VALUE symbol)
|
322
|
+
{
|
323
|
+
SoothPredictor * predictor = NULL;
|
324
|
+
Check_Type(symbol, T_FIXNUM);
|
325
|
+
Check_Type(bigram, T_ARRAY);
|
326
|
+
if (RARRAY_LEN(bigram) != 2)
|
327
|
+
{
|
328
|
+
rb_raise(rb_eTypeError, "bigram must be an array of exactly two symbols");
|
329
|
+
}
|
330
|
+
Check_Type(RARRAY_PTR(bigram)[0], T_FIXNUM);
|
331
|
+
Check_Type(RARRAY_PTR(bigram)[1], T_FIXNUM);
|
332
|
+
Data_Get_Struct(self, SoothPredictor, predictor);
|
333
|
+
uint32_t c_bigram[2] = {NUM2UINT(RARRAY_PTR(bigram)[0]), NUM2UINT(RARRAY_PTR(bigram)[1])};
|
334
|
+
double surprise = sooth_predictor_surprise(predictor, c_bigram, NUM2UINT(symbol));
|
335
|
+
if (surprise < 0)
|
336
|
+
{
|
337
|
+
return Qnil;
|
338
|
+
}
|
339
|
+
return DBL2NUM(surprise);
|
340
|
+
}
|
341
|
+
|
261
342
|
//==============================================================================
|
data/ext/sooth_native/sooth_predictor.c
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
#include <stdio.h>
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <string.h>
|
6
|
+
#include <math.h>
|
6
7
|
|
7
8
|
#include "sooth_predictor.h"
|
8
9
|
|
@@ -337,4 +338,52 @@ sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t
|
|
337
338
|
return predictor->error_symbol;
|
338
339
|
}
|
339
340
|
|
341
|
+
//------------------------------------------------------------------------------
|
342
|
+
|
343
|
+
double
|
344
|
+
sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2])
|
345
|
+
{
|
346
|
+
SoothContext * context = sooth_predictor_find_context(predictor, bigram);
|
347
|
+
|
348
|
+
if (context == NULL || context->count == 0)
|
349
|
+
{
|
350
|
+
return -1;
|
351
|
+
}
|
352
|
+
|
353
|
+
double uncertainty = 0.0;
|
354
|
+
for (uint32_t i = 0; i < context->statistics_size; ++i)
|
355
|
+
{
|
356
|
+
if (context->statistics[i].count > 0)
|
357
|
+
{
|
358
|
+
double frequency = (double)context->statistics[i].count / (double)context->count;
|
359
|
+
uncertainty -= frequency * log2(frequency);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
return uncertainty;
|
364
|
+
}
|
365
|
+
|
366
|
+
//------------------------------------------------------------------------------
|
367
|
+
|
368
|
+
double
|
369
|
+
sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol)
|
370
|
+
{
|
371
|
+
SoothContext * context = sooth_predictor_find_context(predictor, bigram);
|
372
|
+
|
373
|
+
if (context == NULL || context->count == 0)
|
374
|
+
{
|
375
|
+
return -1;
|
376
|
+
}
|
377
|
+
|
378
|
+
SoothStatistic * statistic = sooth_predictor_find_statistic(context, symbol);
|
379
|
+
|
380
|
+
if (statistic == NULL || statistic->count == 0)
|
381
|
+
{
|
382
|
+
return -1;
|
383
|
+
}
|
384
|
+
|
385
|
+
double frequency = (double)statistic->count / (double)context->count;
|
386
|
+
return -log2(frequency);
|
387
|
+
}
|
388
|
+
|
340
389
|
//==============================================================================
|
data/ext/sooth_native/sooth_predictor.h
CHANGED
@@ -26,6 +26,8 @@ bool sooth_predictor_save(const char * const filename, SoothPredictor * predicto
|
|
26
26
|
uint32_t sooth_predictor_observe(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
|
27
27
|
uint32_t sooth_predictor_count(SoothPredictor * predictor, uint32_t bigram[2]);
|
28
28
|
uint32_t sooth_predictor_select(SoothPredictor * predictor, uint32_t bigram[2], uint32_t limit);
|
29
|
+
double sooth_predictor_uncertainty(SoothPredictor * predictor, uint32_t bigram[2]);
|
30
|
+
double sooth_predictor_surprise(SoothPredictor * predictor, uint32_t bigram[2], uint32_t symbol);
|
29
31
|
|
30
32
|
//==============================================================================
|
31
33
|
|
data/sooth.gemspec
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: sooth 0.4.0 ruby lib
|
5
|
+
# stub: sooth 0.5.0 ruby lib
|
6
6
|
# stub: ext/sooth_native/extconf.rb
|
7
7
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = "sooth"
|
10
|
-
s.version = "0.4.0"
|
10
|
+
s.version = "0.5.0"
|
11
11
|
|
12
12
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
13
13
|
s.require_paths = ["lib"]
|
data/spec/predictor_spec.rb
CHANGED
@@ -134,4 +134,45 @@ describe Sooth::Predictor do
|
|
134
134
|
end
|
135
135
|
|
136
136
|
end
|
137
|
+
|
138
|
+
describe "#uncertainty" do
|
139
|
+
|
140
|
+
it "has no uncertainty for a new context" do
|
141
|
+
expect(predictor.uncertainty([1, 2])).to be_nil
|
142
|
+
expect(predictor.count([1, 2])).to eq(0)
|
143
|
+
expect(predictor.uncertainty([1, 2])).to be_nil
|
144
|
+
end
|
145
|
+
|
146
|
+
it "has zero uncertainty for a lone context" do
|
147
|
+
predictor.observe([1, 2], 3)
|
148
|
+
expect(predictor.uncertainty([1, 2])).to eq(0)
|
149
|
+
end
|
150
|
+
|
151
|
+
it "has maximal uncertainty for a uniform distribution" do
|
152
|
+
(1..256).each { |i| predictor.observe([1, 2], i) }
|
153
|
+
expect(predictor.uncertainty([1, 2])).to eq(8)
|
154
|
+
end
|
155
|
+
|
156
|
+
end
|
157
|
+
|
158
|
+
describe "#surprise" do
|
159
|
+
|
160
|
+
it "has no surprise for a new context or symbol" do
|
161
|
+
expect(predictor.surprise([1, 2], 3)).to be_nil
|
162
|
+
expect(predictor.count([1, 2])).to eq(0)
|
163
|
+
expect(predictor.surprise([1, 2], 3)).to be_nil
|
164
|
+
end
|
165
|
+
|
166
|
+
it "has zero surprise for a lone context" do
|
167
|
+
predictor.observe([1, 2], 3)
|
168
|
+
expect(predictor.surprise([1, 2], 3)).to eq(0)
|
169
|
+
end
|
170
|
+
|
171
|
+
it "has uniform surprise for a uniform distribution" do
|
172
|
+
(1..256).each { |i| predictor.observe([1, 2], i) }
|
173
|
+
expect(predictor.surprise([1, 2], 3)).to eq(8)
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
137
178
|
end
|