entropic 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/lib/entropic.rb +38 -47
- data/lib/entropic/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 951873f3bb3edd267bb873b352dde384658d00ba
|
4
|
+
data.tar.gz: d403582aa90292e13a973be1d1652e4401266c00
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88498c5b050afbb5e01fd6f0906d420c8b73a0e78ff1b73af2827c65838a42ad2a636363a5f455ebbed0371366cbf102b3419dfa2e4d9015ac6d53ceee970d39
|
7
|
+
data.tar.gz: 230990158cf17071445c4728ac8ff0317152782fc59ac30265b00bd567849b7679dad0c0ef37752576196a254ec2e5161a138be977223f2bf315f58791504584
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@ require 'entropic'
|
|
8
8
|
=> true
|
9
9
|
>> m.predict("entropy")
|
10
10
|
=> {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
|
11
|
-
>> m.
|
11
|
+
>> m.entropy("yportne")
|
12
12
|
=> {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
|
13
13
|
```
|
14
14
|
|
@@ -21,8 +21,8 @@ You can also train a model, using strings one per line.
|
|
21
21
|
=> true
|
22
22
|
>> File.open('/tmp/training.txt') {|f| n.train(f)}; true
|
23
23
|
=> true
|
24
|
-
>> n.
|
25
|
-
=>
|
24
|
+
>> n.entropy('love')
|
25
|
+
=> 5.132072254636385
|
26
26
|
```
|
27
27
|
|
28
28
|
You can also train a model, using strings and a count of the number of times it appears, tab separated.
|
@@ -32,8 +32,8 @@ You can also train a model, using strings and a count of the number of times it
|
|
32
32
|
=> true
|
33
33
|
>> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
|
34
34
|
=> true
|
35
|
-
>> o.
|
36
|
-
=>
|
35
|
+
>> o.entropy('love')
|
36
|
+
=> 5.132072254636385
|
37
37
|
```
|
38
38
|
|
39
39
|
You can also dump a model, to be read later.
|
data/lib/entropic.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'entropic/version'
|
2
2
|
|
3
3
|
# Public: classes and methods useful for estimating entropy on strings.
|
4
4
|
#
|
@@ -9,7 +9,6 @@ require "entropic/version"
|
|
9
9
|
# # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
|
10
10
|
#
|
11
11
|
module Entropic
|
12
|
-
|
13
12
|
# Public: create a sliding window of ngrams from a string
|
14
13
|
#
|
15
14
|
# string: The String to slide over
|
@@ -17,10 +16,10 @@ module Entropic
|
|
17
16
|
#
|
18
17
|
# Examples
|
19
18
|
#
|
20
|
-
# sliding('01234', 2)
|
19
|
+
# sliding('01234', 2)
|
21
20
|
# # => ['01', '12', '23', '34']
|
22
21
|
#
|
23
|
-
def
|
22
|
+
def self.sliding(string, n)
|
24
23
|
(0..string.length - n).map { |i| (string[i, n]).to_s }
|
25
24
|
end
|
26
25
|
|
@@ -36,7 +35,7 @@ module Entropic
|
|
36
35
|
# Public: update a counter with a string, and a multiplier
|
37
36
|
#
|
38
37
|
# Examples
|
39
|
-
#
|
38
|
+
#
|
40
39
|
# counter = NGramCounter.new(2)
|
41
40
|
# counter.update_with_multiplier('01234', 1)
|
42
41
|
#
|
@@ -53,7 +52,7 @@ module Entropic
|
|
53
52
|
# Public: update a counter with a string, with a multiplier of 1
|
54
53
|
#
|
55
54
|
# Examples
|
56
|
-
#
|
55
|
+
#
|
57
56
|
# counter = NGramCounter.new(2)
|
58
57
|
# counter.update('01234')
|
59
58
|
#
|
@@ -62,11 +61,11 @@ module Entropic
|
|
62
61
|
def update(string)
|
63
62
|
update_with_multiplier(string, 1)
|
64
63
|
end
|
65
|
-
|
64
|
+
|
66
65
|
# Public: get count for string, with default
|
67
66
|
#
|
68
67
|
# Examples
|
69
|
-
#
|
68
|
+
#
|
70
69
|
# counter = NGramCounter.new(2)
|
71
70
|
# counter.update('01234')
|
72
71
|
# counter.count('01', 0)
|
@@ -85,18 +84,17 @@ module Entropic
|
|
85
84
|
# Public: A model for entropy
|
86
85
|
class Model
|
87
86
|
VERSION = '1.0.0'.freeze
|
88
|
-
attr_accessor :size, :
|
87
|
+
attr_accessor :size, :counter
|
89
88
|
|
90
89
|
def initialize(size)
|
91
90
|
@size = size
|
92
|
-
@
|
93
|
-
(1..size).each { |key| @map[key] = NGramCounter.new(key) }
|
91
|
+
@counter = NGramCounter.new(size)
|
94
92
|
end
|
95
93
|
|
96
94
|
# Public: update a model with a string, and a multiplier
|
97
95
|
#
|
98
96
|
# Examples
|
99
|
-
#
|
97
|
+
#
|
100
98
|
# model = Model.new(2)
|
101
99
|
# model.update_with_multiplier('01234', 1)
|
102
100
|
#
|
@@ -104,15 +102,13 @@ module Entropic
|
|
104
102
|
# multiplier: The Integer describing how much weight (will often be 1)
|
105
103
|
#
|
106
104
|
def update_with_multiplier(string, multiplier)
|
107
|
-
@
|
108
|
-
counter.update_with_multiplier(string, multiplier)
|
109
|
-
end
|
105
|
+
@counter.update_with_multiplier(string, multiplier)
|
110
106
|
end
|
111
107
|
|
112
108
|
# Public: update a model with a string, with a multiplier of 1
|
113
109
|
#
|
114
110
|
# Examples
|
115
|
-
#
|
111
|
+
#
|
116
112
|
# model = Model.new(2)
|
117
113
|
# model.update('01234')
|
118
114
|
#
|
@@ -127,48 +123,37 @@ module Entropic
|
|
127
123
|
# or log_prob of a 1-gram appearing once if no suffix found
|
128
124
|
#
|
129
125
|
# Examples
|
130
|
-
#
|
126
|
+
#
|
131
127
|
# model = Model.new(2)
|
132
128
|
# model.update('01234')
|
133
129
|
# model.log_prob('01')
|
134
|
-
#
|
130
|
+
#
|
135
131
|
# string: The String to query
|
136
132
|
#
|
137
133
|
def log_prob(key)
|
138
|
-
|
139
|
-
if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
|
134
|
+
if @counter.total == 0 || !key || key == ''
|
140
135
|
return Math.log(0, 2.0) # -Infinity
|
141
136
|
end
|
142
137
|
|
143
|
-
(
|
144
|
-
|
145
|
-
counter = @map.fetch(k.size, nil)
|
146
|
-
next unless counter
|
147
|
-
count = counter.counts.fetch(k, nil)
|
148
|
-
return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
|
149
|
-
last_total = counter.total
|
150
|
-
end
|
151
|
-
# found it nowhere. Return '1 count' from last total
|
152
|
-
Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
|
138
|
+
count = @counter.count(key, 0.5)
|
139
|
+
Math.log(count, 2.0) - Math.log(counter.total, 2.0)
|
153
140
|
end
|
154
141
|
|
155
142
|
# Public: dump model to some io object
|
156
|
-
#
|
143
|
+
#
|
157
144
|
# io: the IOWriter to write to
|
158
145
|
#
|
159
146
|
def dump(io)
|
160
|
-
@
|
161
|
-
|
162
|
-
io.write("#{k}\t#{ngram}\t#{count}\n")
|
163
|
-
end
|
147
|
+
@counter.counts.each do |ngram, count|
|
148
|
+
io.write("#{@size}\t#{ngram}\t#{count}\n")
|
164
149
|
end
|
165
150
|
end
|
166
151
|
|
167
152
|
# Public: predict the log_prob sum and average over a string
|
168
153
|
# which will be split into ngrams
|
169
|
-
#
|
154
|
+
#
|
170
155
|
# string: The String to query
|
171
|
-
#
|
156
|
+
#
|
172
157
|
# returns: a dictionary of
|
173
158
|
# - log_prob_total
|
174
159
|
# - log_prob_average
|
@@ -180,35 +165,42 @@ module Entropic
|
|
180
165
|
{ log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
|
181
166
|
end
|
182
167
|
|
168
|
+
# Public: predict the entropy over a string
|
169
|
+
# which will be split into ngrams
|
170
|
+
#
|
171
|
+
# string: The String to query
|
172
|
+
#
|
173
|
+
# returns: entropy
|
174
|
+
def entropy(string)
|
175
|
+
-predict(string)[:log_prob_average]
|
176
|
+
end
|
177
|
+
|
183
178
|
# Public: create a Model from reading from an IO object
|
184
179
|
#
|
185
180
|
# io: the IOReader
|
186
181
|
#
|
187
182
|
# returns: Model with stats filled in, and size of largest ngram
|
188
183
|
def self.read(io)
|
189
|
-
model =
|
190
|
-
max_size = 0
|
184
|
+
model = nil
|
191
185
|
io.each_line do |string|
|
192
186
|
ngram_size, ngram, count = string.strip.split(/\t/)
|
193
187
|
ngram_size = ngram_size.to_i
|
194
188
|
count = count.to_f
|
195
|
-
model
|
196
|
-
counter = model.
|
189
|
+
model = Model.new(ngram_size) unless model
|
190
|
+
counter = model.counter
|
197
191
|
counter.total += count
|
198
192
|
counter.counts[ngram] = count
|
199
|
-
max_size = ngram_size if ngram_size > max_size
|
200
193
|
end
|
201
|
-
model.size = max_size
|
202
194
|
model
|
203
195
|
end
|
204
196
|
|
205
197
|
# Public: Train a model on a bunch of data, line by line
|
206
198
|
#
|
207
199
|
# io: the IOReader
|
208
|
-
#
|
200
|
+
#
|
209
201
|
def train(io)
|
210
202
|
io.each_line do |string|
|
211
|
-
update(string)
|
203
|
+
update(string.strip)
|
212
204
|
end
|
213
205
|
end
|
214
206
|
|
@@ -217,7 +209,7 @@ module Entropic
|
|
217
209
|
# each data line should be <string><tab><multiplier>
|
218
210
|
#
|
219
211
|
# io: the IOReader
|
220
|
-
#
|
212
|
+
#
|
221
213
|
def train_with_multiplier(io)
|
222
214
|
io.each_line do |string|
|
223
215
|
text, count = string.strip.split(/\t/)
|
@@ -227,4 +219,3 @@ module Entropic
|
|
227
219
|
end
|
228
220
|
end
|
229
221
|
end
|
230
|
-
|
data/lib/entropic/version.rb
CHANGED