entropic 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/README.md +72 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/entropic.gemspec +35 -0
- data/lib/entropic/version.rb +3 -0
- data/lib/entropic.rb +230 -0
- metadata +96 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
|
4
|
+
data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
|
7
|
+
data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# Entropic
|
2
|
+
|
3
|
+
Entropic trains and predicts entropy of strings based on character n-gram models. For example:
|
4
|
+
|
5
|
+
```ruby
|
6
|
+
require 'entropic'
|
7
|
+
>> require 'open-uri'; m = Entropic::Model.read(URI.open('https://raw.githubusercontent.com/willf/entropy/master/data/google_books_2.tsv')); true
|
8
|
+
=> true
|
9
|
+
>> m.predict("entropy")
|
10
|
+
=> {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
|
11
|
+
>> m.predict("yportne")
|
12
|
+
=> {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
|
13
|
+
```
|
14
|
+
|
15
|
+
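A more negative log probability means a less likely string, so 'yportne' would be expected to score lower than 'entropy'. (Note: the sample totals shown above have 'yportne' scoring higher than 'entropy', which contradicts this; the figures should be regenerated and re-checked.)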
|
16
|
+
|
17
|
+
You can also train a model, using strings one per line.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
>> n = Entropic::Model.new(2); true
|
21
|
+
=> true
|
22
|
+
>> File.open('/tmp/training.txt') {|f| n.train(f)}; true
|
23
|
+
=> true
|
24
|
+
>> n.predict('love')
|
25
|
+
=> {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
|
26
|
+
```
|
27
|
+
|
28
|
+
You can also train a model using lines that each contain a string and a count of the number of times it appears, tab separated.
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
>> o = Entropic::Model.new(2); true
|
32
|
+
=> true
|
33
|
+
>> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
|
34
|
+
=> true
|
35
|
+
>> o.predict('love')
|
36
|
+
=> {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also dump a model, to be read later.
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
>> File.open('/tmp/save.tsv','w') {|f| o.dump(f)}; true
|
43
|
+
=> true
|
44
|
+
```
|
45
|
+
|
46
|
+
## Installation
|
47
|
+
|
48
|
+
Add this line to your application's Gemfile:
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
gem 'entropic'
|
52
|
+
```
|
53
|
+
|
54
|
+
And then execute:
|
55
|
+
|
56
|
+
$ bundle
|
57
|
+
|
58
|
+
Or install it yourself as:
|
59
|
+
|
60
|
+
$ gem install entropic
|
61
|
+
|
62
|
+
|
63
|
+
## Development
|
64
|
+
|
65
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
66
|
+
|
67
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
68
|
+
|
69
|
+
## Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/willf/entropic.
|
72
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads Bundler's environment and the entropic gem,
# then hands control to an interactive IRB session.
require "bundler/setup"
require "entropic"
require "irb"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/entropic.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for entropic.

# Put this checkout's lib/ directory on the load path (once) so the version
# file below can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'entropic/version'

Gem::Specification.new do |gem|
  gem.name          = "entropic"
  gem.version       = Entropic::VERSION
  gem.authors       = ["Will Fitzgerald"]
  gem.email         = ["willf@github.com"]

  # The summary deliberately reuses the description text.
  gem.description   = %q{Train and predict string entropy based on character n-grams.}
  gem.summary       = gem.description
  gem.homepage      = "https://github.com/willf/entropic"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every file git lists, except those under test/, spec/ or features/.
  listed_paths      = `git ls-files -z`.split("\x0")
  gem.files         = listed_paths.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are the base names of packaged files under exe/.
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.14"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rspec", "~> 3.0"
end
|
data/lib/entropic.rb
ADDED
@@ -0,0 +1,230 @@
|
|
1
|
+
require "entropic/version"
|
2
|
+
|
3
|
+
# Public: classes and methods useful for estimating entropy on strings.
#
# Examples
#
#   model = File.open("ngrams.tsv") { |io| Entropic::Model.read(io) }
#   model.predict("the")
#   # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
#
module Entropic

  # Public: create a sliding window of ngrams from a string
  #
  # string: The String to slide over
  # n:      The Integer defining the size of the ngrams
  #
  # Examples
  #
  #   sliding('01234', 2)
  #   # => ['01', '12', '23', '34']
  #
  # Returns an Array of Strings; it is empty when the string is shorter than n.
  def self.sliding(string, n)
    (0..string.length - n).map { |i| string[i, n].to_s }
  end

  # Public: a counter for ngrams of one fixed size
  class NGramCounter
    attr_accessor :size, :counts, :total

    # size: The Integer length of the ngrams this counter tracks
    def initialize(size)
      @size = size
      @counts = Hash.new(0)
      @total = 0
    end

    # Public: update a counter with a string, and a multiplier
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      Entropic.sliding(string, @size).each do |ngram|
        @counts[ngram] += multiplier
        @total += multiplier
      end
    end

    # Public: update a counter with a string, with a multiplier of 1
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: get count for an ngram, with a default
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #   counter.count('01', 0)
    #   #=> 1
    #   counter.count('bob', 0)
    #   #=> 0
    #
    # ngram:        The String to check
    # if_not_found: the value returned when the ngram was never counted
    #               (defaults to 0)
    #
    def count(ngram, if_not_found = 0)
      # fetch (not []) so the caller's default wins over the Hash default of 0.
      @counts.fetch(ngram, if_not_found)
    end
  end

  # Public: A model for entropy, holding one NGramCounter per ngram size
  class Model
    VERSION = '1.0.0'.freeze
    attr_accessor :size, :map

    # size: The Integer size of the largest ngram; counters are created for
    #       every size from 1 up to and including it
    def initialize(size)
      @size = size
      @map = {}
      (1..size).each { |key| @map[key] = NGramCounter.new(key) }
    end

    # Public: update a model with a string, and a multiplier
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      @map.each do |_, counter|
        counter.update_with_multiplier(string, multiplier)
      end
    end

    # Public: update a model with a string, with a multiplier of 1
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: log (base 2) probability of an ngram string in a model
    #
    # Suffixes of the key are tried from the longest to the shortest, and the
    # value for the first suffix with a recorded count is returned. If no
    # suffix is found, the result is the log_prob of an ngram appearing once
    # in the last (shortest) non-empty counter that was consulted.
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #   model.log_prob('01')
    #
    # key: The String to query
    #
    # Returns a Float; -Infinity when the key is nil or empty, or when the
    # model holds no counts at all.
    def log_prob(key)
      if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
        return -Float::INFINITY
      end

      last_total = 1
      # Longest suffix first: shorter suffixes are only a fallback.
      key.size.downto(1) do |length|
        counter = @map.fetch(length, nil)
        next unless counter
        suffix = key[-length..-1]
        # fetch (not []) because counts defaults missing ngrams to 0.
        count = counter.counts.fetch(suffix, nil)
        return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
        # An empty counter must not become the fallback denominator.
        last_total = counter.total if counter.total > 0
      end
      # found it nowhere. Return '1 count' from last total
      Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
    end

    # Public: dump model to some io object, one "size<TAB>ngram<TAB>count"
    # line per ngram
    #
    # io: the IOWriter to write to
    #
    def dump(io)
      @map.each do |k, m|
        m.counts.each do |ngram, count|
          io.write("#{k}\t#{ngram}\t#{count}\n")
        end
      end
    end

    # Public: predict the log_prob sum and average over a string
    # which will be split into ngrams of the model's size
    #
    # string: The String to query
    #
    # returns: a dictionary of
    #   - log_prob_total
    #   - log_prob_average (NaN when the string yields no ngrams)
    #   - size (number of ngrams in string)
    def predict(string)
      ngrams = Entropic.sliding(string, @size)
      log_prob_total = ngrams.map { |ngram| log_prob(ngram) }.inject(0.0, :+)
      log_prob_average = log_prob_total / ngrams.size.to_f
      { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
    end

    # Public: create a Model from reading from an IO object whose lines have
    # the "size<TAB>ngram<TAB>count" layout written by dump
    #
    # io: the IOReader
    #
    # returns: Model with stats filled in, and size of largest ngram
    def self.read(io)
      model = Model.new(0)
      max_size = 0
      io.each_line do |string|
        ngram_size, ngram, count = string.strip.split(/\t/)
        # Blank or incomplete lines carry no data; skip them.
        next if ngram.nil? || count.nil?
        ngram_size = ngram_size.to_i
        count = count.to_f
        counter = (model.map[ngram_size] ||= NGramCounter.new(ngram_size))
        counter.total += count
        # Accumulate so a repeated ngram keeps counts and total in agreement.
        counter.counts[ngram] += count
        max_size = ngram_size if ngram_size > max_size
      end
      model.size = max_size
      model
    end

    # Public: Train a model on a bunch of data, line by line
    #
    # Surrounding whitespace (including the line terminator) is stripped from
    # each line, so newlines never become part of an ngram.
    #
    # io: the IOReader
    #
    def train(io)
      io.each_line do |string|
        update(string.strip)
      end
    end

    # Public: Train a model on a bunch of data, line by line,
    # with a multiplier
    # each data line should be <string><tab><multiplier>
    #
    # Blank lines are skipped; a line without a multiplier is counted once.
    #
    # io: the IOReader
    #
    def train_with_multiplier(io)
      io.each_line do |string|
        text, count = string.strip.split(/\t/)
        next if text.nil? || text.empty?
        multiplier = count.nil? ? 1 : count.to_i
        update_with_multiplier(text, multiplier)
      end
    end
  end
end
|
230
|
+
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: entropic
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Will Fitzgerald
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-01-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.14'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.14'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
description: Train and predict string entropy based on character n-grams.
|
56
|
+
email:
|
57
|
+
- willf@github.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- ".gitignore"
|
63
|
+
- ".rspec"
|
64
|
+
- ".travis.yml"
|
65
|
+
- Gemfile
|
66
|
+
- README.md
|
67
|
+
- Rakefile
|
68
|
+
- bin/console
|
69
|
+
- bin/setup
|
70
|
+
- entropic.gemspec
|
71
|
+
- lib/entropic.rb
|
72
|
+
- lib/entropic/version.rb
|
73
|
+
homepage: https://github.com/willf/entropic
|
74
|
+
licenses: []
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project:
|
92
|
+
rubygems_version: 2.4.5.1
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Train and predict string entropy based on character n-grams.
|
96
|
+
test_files: []
|