entropic 0.1.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/lib/entropic.rb +38 -47
- data/lib/entropic/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 951873f3bb3edd267bb873b352dde384658d00ba
|
4
|
+
data.tar.gz: d403582aa90292e13a973be1d1652e4401266c00
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88498c5b050afbb5e01fd6f0906d420c8b73a0e78ff1b73af2827c65838a42ad2a636363a5f455ebbed0371366cbf102b3419dfa2e4d9015ac6d53ceee970d39
|
7
|
+
data.tar.gz: 230990158cf17071445c4728ac8ff0317152782fc59ac30265b00bd567849b7679dad0c0ef37752576196a254ec2e5161a138be977223f2bf315f58791504584
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ require 'entropic'
|
|
8
8
|
=> true
|
9
9
|
>> m.predict("entropy")
|
10
10
|
=> {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
|
11
|
-
>> m.
|
11
|
+
>> m.entropy("yportne")
|
12
12
|
=> {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
|
13
13
|
```
|
14
14
|
|
@@ -21,8 +21,8 @@ You can also train a model, using strings one per line.
|
|
21
21
|
=> true
|
22
22
|
>> File.open('/tmp/training.txt') {|f| n.train(f)}; true
|
23
23
|
=> true
|
24
|
-
>> n.
|
25
|
-
=>
|
24
|
+
>> n.entropy('love')
|
25
|
+
=> 5.132072254636385
|
26
26
|
```
|
27
27
|
|
28
28
|
You can also train a model, using strings and a count of the number of times it appears, tab separated.
|
@@ -32,8 +32,8 @@ You can also train a model, using strings and a count of the number of times it
|
|
32
32
|
=> true
|
33
33
|
>> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
|
34
34
|
=> true
|
35
|
-
>> o.
|
36
|
-
=>
|
35
|
+
>> o.entropy('love')
|
36
|
+
=> 5.132072254636385
|
37
37
|
```
|
38
38
|
|
39
39
|
You can also dump a model, to be read later.
|
data/lib/entropic.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'entropic/version'
|
2
2
|
|
3
3
|
# Public: classes and methods useful for estimating entropy on strings.
|
4
4
|
#
|
@@ -9,7 +9,6 @@ require "entropic/version"
|
|
9
9
|
# # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
|
10
10
|
#
|
11
11
|
module Entropic
|
12
|
-
|
13
12
|
# Public: create a sliding window of ngrams from a string
|
14
13
|
#
|
15
14
|
# string: The String to slide over
|
@@ -17,10 +16,10 @@ module Entropic
|
|
17
16
|
#
|
18
17
|
# Examples
|
19
18
|
#
|
20
|
-
# sliding('01234', 2)
|
19
|
+
# sliding('01234', 2)
|
21
20
|
# # => ['01', '12', '23', '34']
|
22
21
|
#
|
23
|
-
def
|
22
|
+
def self.sliding(string, n)
|
24
23
|
(0..string.length - n).map { |i| (string[i, n]).to_s }
|
25
24
|
end
|
26
25
|
|
@@ -36,7 +35,7 @@ module Entropic
|
|
36
35
|
# Public: update a counter with a string, and a multiplier
|
37
36
|
#
|
38
37
|
# Examples
|
39
|
-
#
|
38
|
+
#
|
40
39
|
# counter = NGramCounter.new(2)
|
41
40
|
# counter.update_with_multiplier('01234', 1)
|
42
41
|
#
|
@@ -53,7 +52,7 @@ module Entropic
|
|
53
52
|
# Public: update a counter with a string, with a multiplier of 1
|
54
53
|
#
|
55
54
|
# Examples
|
56
|
-
#
|
55
|
+
#
|
57
56
|
# counter = NGramCounter.new(2)
|
58
57
|
# counter.update('01234')
|
59
58
|
#
|
@@ -62,11 +61,11 @@ module Entropic
|
|
62
61
|
def update(string)
|
63
62
|
update_with_multiplier(string, 1)
|
64
63
|
end
|
65
|
-
|
64
|
+
|
66
65
|
# Public: get count for string, with default
|
67
66
|
#
|
68
67
|
# Examples
|
69
|
-
#
|
68
|
+
#
|
70
69
|
# counter = NGramCounter.new(2)
|
71
70
|
# counter.update('01234')
|
72
71
|
# counter.count('01', 0)
|
@@ -85,18 +84,17 @@ module Entropic
|
|
85
84
|
# Public: A model for entropy
|
86
85
|
class Model
|
87
86
|
VERSION = '1.0.0'.freeze
|
88
|
-
attr_accessor :size, :
|
87
|
+
attr_accessor :size, :counter
|
89
88
|
|
90
89
|
def initialize(size)
|
91
90
|
@size = size
|
92
|
-
@
|
93
|
-
(1..size).each { |key| @map[key] = NGramCounter.new(key) }
|
91
|
+
@counter = NGramCounter.new(size)
|
94
92
|
end
|
95
93
|
|
96
94
|
# Public: update a model with a string, and a multiplier
|
97
95
|
#
|
98
96
|
# Examples
|
99
|
-
#
|
97
|
+
#
|
100
98
|
# model = Model.new(2)
|
101
99
|
# model.update_with_multiplier('01234', 1)
|
102
100
|
#
|
@@ -104,15 +102,13 @@ module Entropic
|
|
104
102
|
# multiplier: The Integer describing how much weight (will often be 1)
|
105
103
|
#
|
106
104
|
def update_with_multiplier(string, multiplier)
|
107
|
-
@
|
108
|
-
counter.update_with_multiplier(string, multiplier)
|
109
|
-
end
|
105
|
+
@counter.update_with_multiplier(string, multiplier)
|
110
106
|
end
|
111
107
|
|
112
108
|
# Public: update a model with a string, with a multiplier of 1
|
113
109
|
#
|
114
110
|
# Examples
|
115
|
-
#
|
111
|
+
#
|
116
112
|
# model = Model.new(2)
|
117
113
|
# model.update('01234')
|
118
114
|
#
|
@@ -127,48 +123,37 @@ module Entropic
|
|
127
123
|
# or log_prob of a 1-gram appearing once if no suffix found
|
128
124
|
#
|
129
125
|
# Examples
|
130
|
-
#
|
126
|
+
#
|
131
127
|
# model = Model.new(2)
|
132
128
|
# model.update('01234')
|
133
129
|
# model.log_prob('01')
|
134
|
-
#
|
130
|
+
#
|
135
131
|
# string: The String to query
|
136
132
|
#
|
137
133
|
def log_prob(key)
|
138
|
-
|
139
|
-
if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
|
134
|
+
if @counter.total == 0 || !key || key == ''
|
140
135
|
return Math.log(0, 2.0) # -Infinity
|
141
136
|
end
|
142
137
|
|
143
|
-
(
|
144
|
-
|
145
|
-
counter = @map.fetch(k.size, nil)
|
146
|
-
next unless counter
|
147
|
-
count = counter.counts.fetch(k, nil)
|
148
|
-
return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
|
149
|
-
last_total = counter.total
|
150
|
-
end
|
151
|
-
# found it nowhere. Return '1 count' from last total
|
152
|
-
Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
|
138
|
+
count = @counter.count(key, 0.5)
|
139
|
+
Math.log(count, 2.0) - Math.log(counter.total, 2.0)
|
153
140
|
end
|
154
141
|
|
155
142
|
# Public: dump model to some io object
|
156
|
-
#
|
143
|
+
#
|
157
144
|
# io: the IOWriter to write to
|
158
145
|
#
|
159
146
|
def dump(io)
|
160
|
-
@
|
161
|
-
|
162
|
-
io.write("#{k}\t#{ngram}\t#{count}\n")
|
163
|
-
end
|
147
|
+
@counter.counts.each do |ngram, count|
|
148
|
+
io.write("#{@size}\t#{ngram}\t#{count}\n")
|
164
149
|
end
|
165
150
|
end
|
166
151
|
|
167
152
|
# Public: predict the log_prob sum and average over a string
|
168
153
|
# which will be split into ngrams
|
169
|
-
#
|
154
|
+
#
|
170
155
|
# string: The String to query
|
171
|
-
#
|
156
|
+
#
|
172
157
|
# returns: a dictionary of
|
173
158
|
# - log_prob_total
|
174
159
|
# - log_prob_average
|
@@ -180,35 +165,42 @@ module Entropic
|
|
180
165
|
{ log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
|
181
166
|
end
|
182
167
|
|
168
|
+
# Public: predict the entropy over a string
|
169
|
+
# which will be split into ngrams
|
170
|
+
#
|
171
|
+
# string: The String to query
|
172
|
+
#
|
173
|
+
# returns: entropy
|
174
|
+
def entropy(string)
|
175
|
+
-predict(string)[:log_prob_average]
|
176
|
+
end
|
177
|
+
|
183
178
|
# Public: create a Model from reading from an IO object
|
184
179
|
#
|
185
180
|
# io: the IOReader
|
186
181
|
#
|
187
182
|
# returns: Model with stats filled in, and size of largest ngram
|
188
183
|
def self.read(io)
|
189
|
-
model =
|
190
|
-
max_size = 0
|
184
|
+
model = nil
|
191
185
|
io.each_line do |string|
|
192
186
|
ngram_size, ngram, count = string.strip.split(/\t/)
|
193
187
|
ngram_size = ngram_size.to_i
|
194
188
|
count = count.to_f
|
195
|
-
model
|
196
|
-
counter = model.
|
189
|
+
model = Model.new(ngram_size) unless model
|
190
|
+
counter = model.counter
|
197
191
|
counter.total += count
|
198
192
|
counter.counts[ngram] = count
|
199
|
-
max_size = ngram_size if ngram_size > max_size
|
200
193
|
end
|
201
|
-
model.size = max_size
|
202
194
|
model
|
203
195
|
end
|
204
196
|
|
205
197
|
# Public: Train a model on a bunch of data, line by line
|
206
198
|
#
|
207
199
|
# io: the IOReader
|
208
|
-
#
|
200
|
+
#
|
209
201
|
def train(io)
|
210
202
|
io.each_line do |string|
|
211
|
-
update(string)
|
203
|
+
update(string.strip)
|
212
204
|
end
|
213
205
|
end
|
214
206
|
|
@@ -217,7 +209,7 @@ module Entropic
|
|
217
209
|
# each data line should be <string><tab><multiplier>
|
218
210
|
#
|
219
211
|
# io: the IOReader
|
220
|
-
#
|
212
|
+
#
|
221
213
|
def train_with_multiplier(io)
|
222
214
|
io.each_line do |string|
|
223
215
|
text, count = string.strip.split(/\t/)
|
@@ -227,4 +219,3 @@ module Entropic
|
|
227
219
|
end
|
228
220
|
end
|
229
221
|
end
|
230
|
-
|
data/lib/entropic/version.rb
CHANGED