entropic 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
4
- data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
3
+ metadata.gz: 951873f3bb3edd267bb873b352dde384658d00ba
4
+ data.tar.gz: d403582aa90292e13a973be1d1652e4401266c00
5
5
  SHA512:
6
- metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
7
- data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
6
+ metadata.gz: 88498c5b050afbb5e01fd6f0906d420c8b73a0e78ff1b73af2827c65838a42ad2a636363a5f455ebbed0371366cbf102b3419dfa2e4d9015ac6d53ceee970d39
7
+ data.tar.gz: 230990158cf17071445c4728ac8ff0317152782fc59ac30265b00bd567849b7679dad0c0ef37752576196a254ec2e5161a138be977223f2bf315f58791504584
data/README.md CHANGED
@@ -8,7 +8,7 @@ require 'entropic'
8
8
  => true
9
9
  >> m.predict("entropy")
10
10
  => {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
11
- >> m.predict("yportne")
11
+ >> m.entropy("yportne")
12
12
  => {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
13
13
  ```
14
14
 
@@ -21,8 +21,8 @@ You can also train a model, using strings one per line.
21
21
  => true
22
22
  >> File.open('/tmp/training.txt') {|f| n.train(f)}; true
23
23
  => true
24
- >> n.predict('love')
25
- => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
24
+ >> n.entropy('love')
25
+ => 5.132072254636385
26
26
  ```
27
27
 
28
28
  You can also train a model, using strings and a count of the number of times it appears, tab separated.
@@ -32,8 +32,8 @@ You can also train a model, using strings and a count of the number of times it
32
32
  => true
33
33
  >> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
34
34
  => true
35
- >> o.predict('love')
36
- => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
35
+ >> o.entropy('love')
36
+ => 5.132072254636385
37
37
  ```
38
38
 
39
39
  You can also dump a model, to be read later.
@@ -1,4 +1,4 @@
1
- require "entropic/version"
1
+ require 'entropic/version'
2
2
 
3
3
  # Public: classes and methods useful for estimating entropy on strings.
4
4
  #
@@ -9,7 +9,6 @@ require "entropic/version"
9
9
  # # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
10
10
  #
11
11
  module Entropic
12
-
13
12
  # Public: create a sliding window of ngrams from a string
14
13
  #
15
14
  # string: The String to slide over
@@ -17,10 +16,10 @@ module Entropic
17
16
  #
18
17
  # Examples
19
18
  #
20
- # sliding('01234', 2)
19
+ # sliding('01234', 2)
21
20
  # # => ['01', '12', '23', '34']
22
21
  #
23
- def Entropic.sliding(string, n)
22
+ def self.sliding(string, n)
24
23
  (0..string.length - n).map { |i| (string[i, n]).to_s }
25
24
  end
26
25
 
@@ -36,7 +35,7 @@ module Entropic
36
35
  # Public: update a counter with a string, and a multiplier
37
36
  #
38
37
  # Examples
39
- #
38
+ #
40
39
  # counter = NGramCounter.new(2)
41
40
  # counter.update_with_multiplier('01234', 1)
42
41
  #
@@ -53,7 +52,7 @@ module Entropic
53
52
  # Public: update a counter with a string, with a multiplier of 1
54
53
  #
55
54
  # Examples
56
- #
55
+ #
57
56
  # counter = NGramCounter.new(2)
58
57
  # counter.update('01234')
59
58
  #
@@ -62,11 +61,11 @@ module Entropic
62
61
  def update(string)
63
62
  update_with_multiplier(string, 1)
64
63
  end
65
-
64
+
66
65
  # Public: get count for string, with default
67
66
  #
68
67
  # Examples
69
- #
68
+ #
70
69
  # counter = NGramCounter.new(2)
71
70
  # counter.update('01234')
72
71
  # counter.count('01', 0)
@@ -85,18 +84,17 @@ module Entropic
85
84
  # Public: A model for entropy
86
85
  class Model
87
86
  VERSION = '1.0.0'.freeze
88
- attr_accessor :size, :map
87
+ attr_accessor :size, :counter
89
88
 
90
89
  def initialize(size)
91
90
  @size = size
92
- @map = {}
93
- (1..size).each { |key| @map[key] = NGramCounter.new(key) }
91
+ @counter = NGramCounter.new(size)
94
92
  end
95
93
 
96
94
  # Public: update a model with a string, and a multiplier
97
95
  #
98
96
  # Examples
99
- #
97
+ #
100
98
  # model = Model.new(2)
101
99
  # model.update_with_multiplier('01234', 1)
102
100
  #
@@ -104,15 +102,13 @@ module Entropic
104
102
  # multiplier: The Integer describing how much weight (will often be 1)
105
103
  #
106
104
  def update_with_multiplier(string, multiplier)
107
- @map.each do |_, counter|
108
- counter.update_with_multiplier(string, multiplier)
109
- end
105
+ @counter.update_with_multiplier(string, multiplier)
110
106
  end
111
107
 
112
108
  # Public: update a model with a string, with a multiplier of 1
113
109
  #
114
110
  # Examples
115
- #
111
+ #
116
112
  # model = Model.new(2)
117
113
  # model.update('01234')
118
114
  #
@@ -127,48 +123,37 @@ module Entropic
127
123
  # or log_prob of a 1-gram appearing once if no suffix found
128
124
  #
129
125
  # Examples
130
- #
126
+ #
131
127
  # model = Model.new(2)
132
128
  # model.update('01234')
133
129
  # model.log_prob('01')
134
- #
130
+ #
135
131
  # string: The String to query
136
132
  #
137
133
  def log_prob(key)
138
- last_total = 1
139
- if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
134
+ if @counter.total == 0 || !key || key == ''
140
135
  return Math.log(0, 2.0) # -Infinity
141
136
  end
142
137
 
143
- (1..key.size).each do |i|
144
- k = key[-i..-1]
145
- counter = @map.fetch(k.size, nil)
146
- next unless counter
147
- count = counter.counts.fetch(k, nil)
148
- return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
149
- last_total = counter.total
150
- end
151
- # found it nowhere. Return '1 count' from last total
152
- Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
138
+ count = @counter.count(key, 0.5)
139
+ Math.log(count, 2.0) - Math.log(counter.total, 2.0)
153
140
  end
154
141
 
155
142
  # Public: dump model to some io object
156
- #
143
+ #
157
144
  # io: the IOWriter to write to
158
145
  #
159
146
  def dump(io)
160
- @map.each do |k, m|
161
- m.counts.each do |ngram, count|
162
- io.write("#{k}\t#{ngram}\t#{count}\n")
163
- end
147
+ @counter.counts.each do |ngram, count|
148
+ io.write("#{@size}\t#{ngram}\t#{count}\n")
164
149
  end
165
150
  end
166
151
 
167
152
  # Public: predict the log_prob sum and average over a string
168
153
  # which will be split into ngrams
169
- #
154
+ #
170
155
  # string: The String to query
171
- #
156
+ #
172
157
  # returns: a dictionary of
173
158
  # - log_prob_total
174
159
  # - log_prob_average
@@ -180,35 +165,42 @@ module Entropic
180
165
  { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
181
166
  end
182
167
 
168
+ # Public: predict the entropy over a string
169
+ # which will be split into ngrams
170
+ #
171
+ # string: The String to query
172
+ #
173
+ # returns: entropy
174
+ def entropy(string)
175
+ -predict(string)[:log_prob_average]
176
+ end
177
+
183
178
  # Public: create a Model from reading from an IO object
184
179
  #
185
180
  # io: the IOReader
186
181
  #
187
182
  # returns: Model with stats filled in, and size of largest ngram
188
183
  def self.read(io)
189
- model = Model.new(0)
190
- max_size = 0
184
+ model = nil
191
185
  io.each_line do |string|
192
186
  ngram_size, ngram, count = string.strip.split(/\t/)
193
187
  ngram_size = ngram_size.to_i
194
188
  count = count.to_f
195
- model.map[ngram_size] = NGramCounter.new(ngram_size) unless model.map.include?(ngram_size)
196
- counter = model.map[ngram_size]
189
+ model = Model.new(ngram_size) unless model
190
+ counter = model.counter
197
191
  counter.total += count
198
192
  counter.counts[ngram] = count
199
- max_size = ngram_size if ngram_size > max_size
200
193
  end
201
- model.size = max_size
202
194
  model
203
195
  end
204
196
 
205
197
  # Public: Train a model on a bunch of data, line by line
206
198
  #
207
199
  # io: the IOReader
208
- #
200
+ #
209
201
  def train(io)
210
202
  io.each_line do |string|
211
- update(string)
203
+ update(string.strip)
212
204
  end
213
205
  end
214
206
 
@@ -217,7 +209,7 @@ module Entropic
217
209
  # each data line should be <string><tab><multiplier>
218
210
  #
219
211
  # io: the IOReader
220
- #
212
+ #
221
213
  def train_with_multiplier(io)
222
214
  io.each_line do |string|
223
215
  text, count = string.strip.split(/\t/)
@@ -227,4 +219,3 @@ module Entropic
227
219
  end
228
220
  end
229
221
  end
230
-
@@ -1,3 +1,3 @@
1
1
  module Entropic
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entropic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Fitzgerald