entropic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/README.md +72 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/entropic.gemspec +35 -0
- data/lib/entropic/version.rb +3 -0
- data/lib/entropic.rb +230 -0
- metadata +96 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
|
4
|
+
data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
|
7
|
+
data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# Entropic
|
2
|
+
|
3
|
+
Entropic trains and predicts entropy of strings based on character n-gram models. For example:
|
4
|
+
|
5
|
+
```ruby
|
6
|
+
require 'entropic'
|
7
|
+
>> m = Entropic::Model.read(open('https://raw.githubusercontent.com/willf/entropy/master/data/google_books_2.tsv')); true
|
8
|
+
=> true
|
9
|
+
>> m.predict("entropy")
|
10
|
+
=> {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
|
11
|
+
>> m.predict("yportne")
|
12
|
+
=> {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
|
13
|
+
```
|
14
|
+
|
15
|
+
The string 'yportne' is much less likely than the string 'entropy'.
|
16
|
+
|
17
|
+
You can also train a model, using strings one per line.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
>> n = Entropic::Model.new(2); true
|
21
|
+
=> true
|
22
|
+
>> File.open('/tmp/training.txt') {|f| n.train(f)}; true
|
23
|
+
=> true
|
24
|
+
>> n.predict('love')
|
25
|
+
=> {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
|
26
|
+
```
|
27
|
+
|
28
|
+
You can also train a model using strings paired with a count of the number of times each one appears, tab-separated.
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
>> o = Entropic::Model.new(2); true
|
32
|
+
=> true
|
33
|
+
>> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
|
34
|
+
=> true
|
35
|
+
>> o.predict('love')
|
36
|
+
=> {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also dump a model, to be read later.
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
>> File.open('/tmp/save.tsv','w') {|f| o.dump(f)}; true
|
43
|
+
=> true
|
44
|
+
```
|
45
|
+
|
46
|
+
## Installation
|
47
|
+
|
48
|
+
Add this line to your application's Gemfile:
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
gem 'entropic'
|
52
|
+
```
|
53
|
+
|
54
|
+
And then execute:
|
55
|
+
|
56
|
+
$ bundle
|
57
|
+
|
58
|
+
Or install it yourself as:
|
59
|
+
|
60
|
+
$ gem install entropic
|
61
|
+
|
62
|
+
|
63
|
+
## Development
|
64
|
+
|
65
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
|
+
|
67
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
|
+
|
69
|
+
## Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/willf/entropic.
|
72
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
require "entropic"

# Fixtures or other initialization code that makes experimenting with the gem
# easier can be added here. A different console may be substituted as well.

# (Pry users: remember to add pry to the Gemfile first.)
# require "pry"
# Pry.start

# Start an interactive IRB session with the gem already loaded.
require "irb"
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/entropic.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'entropic/version'

Gem::Specification.new do |spec|
  blurb = "Train and predict string entropy based on character n-grams."

  spec.name          = "entropic"
  spec.version       = Entropic::VERSION
  spec.authors       = ["Will Fitzgerald"]
  spec.email         = ["willf@github.com"]

  spec.description   = blurb
  spec.summary       = spec.description
  spec.homepage      = "https://github.com/willf/entropic"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }
  spec.bindir        = "exe"
  # Executables are the base names of whatever is packaged under exe/.
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.14"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
|
data/lib/entropic.rb
ADDED
@@ -0,0 +1,230 @@
|
|
1
|
+
require "entropic/version"
|
2
|
+
|
3
|
+
# Public: classes and methods useful for estimating entropy on strings.
|
4
|
+
#
|
5
|
+
# Examples
|
6
|
+
#
|
7
|
+
#   model = File.open("ngrams.tsv") { |io| Entropic::Model.read(io) }
|
8
|
+
# model.predict("the")
|
9
|
+
# # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
|
10
|
+
#
|
11
|
+
module Entropic

  # Public: create a sliding window of ngrams from a string
  #
  # string: The String to slide over
  # n:      The Integer defining the size of the ngrams
  #
  # Examples
  #
  #   Entropic.sliding('01234', 2)
  #   # => ['01', '12', '23', '34']
  #
  # Returns an Array of Strings; it is empty when the string is shorter
  # than n.
  def self.sliding(string, n)
    (0..string.length - n).map { |i| string[i, n].to_s }
  end

  # Public: a counter for ngrams of one fixed size
  class NGramCounter
    attr_accessor :size, :counts, :total

    # Public: create an empty counter
    #
    # size: The Integer length of the ngrams this counter tracks
    #
    def initialize(size)
      @size = size
      @counts = Hash.new(0)
      @total = 0
    end

    # Public: update a counter with a string, and a multiplier
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      Entropic.sliding(string, @size).each do |ngram|
        @counts[ngram] += multiplier
        @total += multiplier
      end
    end

    # Public: update a counter with a string, with a multiplier of 1
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: get count for string, with default
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #   counter.count('01', 0)
    #   #=> 1
    #   counter.count('bob', 0)
    #   #=> 0
    #
    # ngram:        The String to check
    # if_not_found: the value to return when the ngram has not been counted
    #
    def count(ngram, if_not_found)
      @counts.fetch(ngram, if_not_found)
    end
  end

  # Public: A model for entropy. It holds one NGramCounter per ngram size,
  # keyed by that size in #map.
  class Model
    VERSION = '1.0.0'.freeze
    attr_accessor :size, :map

    # Public: create a model with empty counters for sizes 1 through size
    #
    # size: The Integer size of the largest ngram to track
    #
    def initialize(size)
      @size = size
      @map = {}
      (1..size).each { |key| @map[key] = NGramCounter.new(key) }
    end

    # Public: update a model with a string, and a multiplier
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      @map.each_value do |counter|
        counter.update_with_multiplier(string, multiplier)
      end
    end

    # Public: update a model with a string, with a multiplier of 1
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: log (base 2) probability of a ngram string in a model
    # returns value of first suffix of string
    # or log_prob of a 1-gram appearing once if no suffix found
    #
    # Suffixes are tried from shortest to longest, so a known 1-gram is
    # returned before any longer ngram is consulted.
    # NOTE(review): confirm shortest-first is the intended back-off order.
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #   model.log_prob('01')
    #
    # key: The String to query
    #
    # Returns a Float; -Infinity when the model has no counts or the key is
    # nil or empty.
    def log_prob(key)
      untrained = @map.all? { |_, counter| counter.counts.empty? }
      return -Float::INFINITY if untrained || !key || key == ''

      last_total = 1
      (1..key.size).each do |i|
        suffix = key[-i..-1]
        counter = @map.fetch(suffix.size, nil)
        next unless counter
        count = counter.counts.fetch(suffix, nil)
        return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
        # An empty counter has total 0; using it below would yield
        # 0 - log(0) = +Infinity, so keep the previous non-zero total.
        last_total = counter.total if counter.total > 0
      end
      # found it nowhere. Return '1 count' from last total
      Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
    end

    # Public: dump model to some io object, one
    # <size><tab><ngram><tab><count> row per ngram
    #
    # io: the IOWriter to write to
    #
    def dump(io)
      @map.each do |k, m|
        m.counts.each do |ngram, count|
          io.write("#{k}\t#{ngram}\t#{count}\n")
        end
      end
    end

    # Public: predict the log_prob sum and average over a string
    # which will be split into ngrams
    #
    # string: The String to query
    #
    # returns: a dictionary of
    #   - log_prob_total
    #   - log_prob_average (NaN when the string yields no ngrams, since the
    #     total 0.0 is divided by a size of 0.0)
    #   - size (number of ngrams in string)
    def predict(string)
      ngrams = Entropic.sliding(string, @size)
      log_prob_total = ngrams.map { |ngram| log_prob(ngram) }.inject(0.0, :+)
      log_prob_average = log_prob_total / ngrams.size.to_f
      { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
    end

    # Public: create a Model from reading from an IO object whose rows are
    # <size><tab><ngram><tab><count>, as written by #dump
    #
    # Blank rows and rows missing the ngram or count field are skipped.
    # Counts of a repeated row are summed so they stay consistent with the
    # counter's total.
    #
    # io: the IOReader
    #
    # returns: Model with stats filled in, and size of largest ngram
    def self.read(io)
      model = Model.new(0)
      max_size = 0
      io.each_line do |line|
        ngram_size, ngram, count = line.strip.split(/\t/)
        next if ngram.nil? || count.nil?
        ngram_size = ngram_size.to_i
        count = count.to_f
        counter = (model.map[ngram_size] ||= NGramCounter.new(ngram_size))
        counter.total += count
        counter.counts[ngram] += count
        max_size = ngram_size if ngram_size > max_size
      end
      model.size = max_size
      model
    end

    # Public: Train a model on a bunch of data, line by line
    #
    # The line terminator is removed before counting, so it never becomes
    # part of an ngram (which would also break the row format of #dump).
    #
    # io: the IOReader
    #
    def train(io)
      io.each_line do |line|
        update(line.chomp)
      end
    end

    # Public: Train a model on a bunch of data, line by line,
    # with a multiplier
    # each data line should be <string><tab><multiplier>
    #
    # Blank lines are skipped.
    #
    # io: the IOReader
    #
    def train_with_multiplier(io)
      io.each_line do |line|
        text, count = line.strip.split(/\t/)
        next if text.nil?
        update_with_multiplier(text, count.to_i)
      end
    end
  end
end
|
230
|
+
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: entropic
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Will Fitzgerald
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.14'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.14'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
description: Train and predict string entropy based on character n-grams.
|
56
|
+
email:
|
57
|
+
- willf@github.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".gitignore"
|
63
|
+
- ".rspec"
|
64
|
+
- ".travis.yml"
|
65
|
+
- Gemfile
|
66
|
+
- README.md
|
67
|
+
- Rakefile
|
68
|
+
- bin/console
|
69
|
+
- bin/setup
|
70
|
+
- entropic.gemspec
|
71
|
+
- lib/entropic.rb
|
72
|
+
- lib/entropic/version.rb
|
73
|
+
homepage: https://github.com/willf/entropic
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project:
|
92
|
+
rubygems_version: 2.4.5.1
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Train and predict string entropy based on character n-grams.
|
96
|
+
test_files: []
|