entropic 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
4
- data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
3
+ metadata.gz: 951873f3bb3edd267bb873b352dde384658d00ba
4
+ data.tar.gz: d403582aa90292e13a973be1d1652e4401266c00
5
5
  SHA512:
6
- metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
7
- data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
6
+ metadata.gz: 88498c5b050afbb5e01fd6f0906d420c8b73a0e78ff1b73af2827c65838a42ad2a636363a5f455ebbed0371366cbf102b3419dfa2e4d9015ac6d53ceee970d39
7
+ data.tar.gz: 230990158cf17071445c4728ac8ff0317152782fc59ac30265b00bd567849b7679dad0c0ef37752576196a254ec2e5161a138be977223f2bf315f58791504584
data/README.md CHANGED
@@ -8,7 +8,7 @@ require 'entropic'
8
8
  => true
9
9
  >> m.predict("entropy")
10
10
  => {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
11
- >> m.predict("yportne")
11
+ >> m.entropy("yportne")
12
12
  => {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
13
13
  ```
14
14
 
@@ -21,8 +21,8 @@ You can also train a model, using strings one per line.
21
21
  => true
22
22
  >> File.open('/tmp/training.txt') {|f| n.train(f)}; true
23
23
  => true
24
- >> n.predict('love')
25
- => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
24
+ >> n.entropy('love')
25
+ => 5.132072254636385
26
26
  ```
27
27
 
28
28
  You can also train a model using strings paired with a count of the number of times each appears, tab separated.
@@ -32,8 +32,8 @@ You can also train a model, using strings and a count of the number of times it
32
32
  => true
33
33
  >> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
34
34
  => true
35
- >> o.predict('love')
36
- => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
35
+ >> o.entropy('love')
36
+ => 5.132072254636385
37
37
  ```
38
38
 
39
39
  You can also dump a model, to be read later.
@@ -1,4 +1,4 @@
1
- require "entropic/version"
1
+ require 'entropic/version'
2
2
 
3
3
  # Public: classes and methods useful for estimating entropy on strings.
4
4
  #
@@ -9,7 +9,6 @@ require "entropic/version"
9
9
  # # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
10
10
  #
11
11
  module Entropic
12
-
13
12
  # Public: create a sliding window of ngrams from a string
14
13
  #
15
14
  # string: The String to slide over
@@ -17,10 +16,10 @@ module Entropic
17
16
  #
18
17
  # Examples
19
18
  #
20
- # sliding('01234', 2)
19
+ # sliding('01234', 2)
21
20
  # # => ['01', '12', '23', '34']
22
21
  #
23
- def Entropic.sliding(string, n)
22
+ def self.sliding(string, n)
24
23
  (0..string.length - n).map { |i| (string[i, n]).to_s }
25
24
  end
26
25
 
@@ -36,7 +35,7 @@ module Entropic
36
35
  # Public: update a counter with a string, and a multiplier
37
36
  #
38
37
  # Examples
39
- #
38
+ #
40
39
  # counter = NGramCounter.new(2)
41
40
  # counter.update_with_multiplier('01234', 1)
42
41
  #
@@ -53,7 +52,7 @@ module Entropic
53
52
  # Public: update a counter with a string, with a multiplier of 1
54
53
  #
55
54
  # Examples
56
- #
55
+ #
57
56
  # counter = NGramCounter.new(2)
58
57
  # counter.update('01234')
59
58
  #
@@ -62,11 +61,11 @@ module Entropic
62
61
  def update(string)
63
62
  update_with_multiplier(string, 1)
64
63
  end
65
-
64
+
66
65
  # Public: get count for string, with default
67
66
  #
68
67
  # Examples
69
- #
68
+ #
70
69
  # counter = NGramCounter.new(2)
71
70
  # counter.update('01234')
72
71
  # counter.count('01', 0)
@@ -85,18 +84,17 @@ module Entropic
85
84
  # Public: A model for entropy
86
85
  class Model
87
86
  VERSION = '1.0.0'.freeze
88
- attr_accessor :size, :map
87
+ attr_accessor :size, :counter
89
88
 
90
89
  def initialize(size)
91
90
  @size = size
92
- @map = {}
93
- (1..size).each { |key| @map[key] = NGramCounter.new(key) }
91
+ @counter = NGramCounter.new(size)
94
92
  end
95
93
 
96
94
  # Public: update a model with a string, and a multiplier
97
95
  #
98
96
  # Examples
99
- #
97
+ #
100
98
  # model = Model.new(2)
101
99
  # model.update_with_multiplier('01234', 1)
102
100
  #
@@ -104,15 +102,13 @@ module Entropic
104
102
  # multiplier: The Integer describing how much weight (will often be 1)
105
103
  #
106
104
  def update_with_multiplier(string, multiplier)
107
- @map.each do |_, counter|
108
- counter.update_with_multiplier(string, multiplier)
109
- end
105
+ @counter.update_with_multiplier(string, multiplier)
110
106
  end
111
107
 
112
108
  # Public: update a model with a string, with a multiplier of 1
113
109
  #
114
110
  # Examples
115
- #
111
+ #
116
112
  # model = Model.new(2)
117
113
  # model.update('01234')
118
114
  #
@@ -127,48 +123,37 @@ module Entropic
127
123
  # or log_prob of a 1-gram appearing once if no suffix found
128
124
  #
129
125
  # Examples
130
- #
126
+ #
131
127
  # model = Model.new(2)
132
128
  # model.update('01234')
133
129
  # model.log_prob('01')
134
- #
130
+ #
135
131
  # string: The String to query
136
132
  #
137
133
  def log_prob(key)
138
- last_total = 1
139
- if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
134
+ if @counter.total == 0 || !key || key == ''
140
135
  return Math.log(0, 2.0) # -Infinity
141
136
  end
142
137
 
143
- (1..key.size).each do |i|
144
- k = key[-i..-1]
145
- counter = @map.fetch(k.size, nil)
146
- next unless counter
147
- count = counter.counts.fetch(k, nil)
148
- return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
149
- last_total = counter.total
150
- end
151
- # found it nowhere. Return '1 count' from last total
152
- Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
138
+ count = @counter.count(key, 0.5)
139
+ Math.log(count, 2.0) - Math.log(counter.total, 2.0)
153
140
  end
154
141
 
155
142
  # Public: dump model to some io object
156
- #
143
+ #
157
144
  # io: the IOWriter to write to
158
145
  #
159
146
  def dump(io)
160
- @map.each do |k, m|
161
- m.counts.each do |ngram, count|
162
- io.write("#{k}\t#{ngram}\t#{count}\n")
163
- end
147
+ @counter.counts.each do |ngram, count|
148
+ io.write("#{@size}\t#{ngram}\t#{count}\n")
164
149
  end
165
150
  end
166
151
 
167
152
  # Public: predict the log_prob sum and average over a string
168
153
  # which will be split into ngrams
169
- #
154
+ #
170
155
  # string: The String to query
171
- #
156
+ #
172
157
  # returns: a dictionary of
173
158
  # - log_prob_total
174
159
  # - log_prob_average
@@ -180,35 +165,42 @@ module Entropic
180
165
  { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
181
166
  end
182
167
 
168
+ # Public: predict the entropy over a string
169
+ # which will be split into ngrams
170
+ #
171
+ # string: The String to query
172
+ #
173
+ # returns: entropy
174
+ def entropy(string)
175
+ -predict(string)[:log_prob_average]
176
+ end
177
+
183
178
  # Public: create a Model from reading from an IO object
184
179
  #
185
180
  # io: the IOReader
186
181
  #
187
182
  # returns: Model with stats filled in, and size of largest ngram
188
183
  def self.read(io)
189
- model = Model.new(0)
190
- max_size = 0
184
+ model = nil
191
185
  io.each_line do |string|
192
186
  ngram_size, ngram, count = string.strip.split(/\t/)
193
187
  ngram_size = ngram_size.to_i
194
188
  count = count.to_f
195
- model.map[ngram_size] = NGramCounter.new(ngram_size) unless model.map.include?(ngram_size)
196
- counter = model.map[ngram_size]
189
+ model = Model.new(ngram_size) unless model
190
+ counter = model.counter
197
191
  counter.total += count
198
192
  counter.counts[ngram] = count
199
- max_size = ngram_size if ngram_size > max_size
200
193
  end
201
- model.size = max_size
202
194
  model
203
195
  end
204
196
 
205
197
  # Public: Train a model on a bunch of data, line by line
206
198
  #
207
199
  # io: the IOReader
208
- #
200
+ #
209
201
  def train(io)
210
202
  io.each_line do |string|
211
- update(string)
203
+ update(string.strip)
212
204
  end
213
205
  end
214
206
 
@@ -217,7 +209,7 @@ module Entropic
217
209
  # each data line should be <string><tab><multiplier>
218
210
  #
219
211
  # io: the IOReader
220
- #
212
+ #
221
213
  def train_with_multiplier(io)
222
214
  io.each_line do |string|
223
215
  text, count = string.strip.split(/\t/)
@@ -227,4 +219,3 @@ module Entropic
227
219
  end
228
220
  end
229
221
  end
230
-
@@ -1,3 +1,3 @@
1
1
  module Entropic
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entropic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Fitzgerald