entropic 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
4
+ data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
5
+ SHA512:
6
+ metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
7
+ data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
13
+
14
+ tags
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.2.5
5
+ before_install: gem install bundler -v 1.14.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in entropic.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,72 @@
1
+ # Entropic
2
+
3
+ Entropic trains and predicts entropy of strings based on character n-gram models. For example:
4
+
5
+ ```ruby
6
+ require 'entropic'
7
+ >> m = Entropic::Model.read(open('https://raw.githubusercontent.com/willf/entropy/master/data/google_books_2.tsv')); true
8
+ => true
9
+ >> m.predict("entropy")
10
+ => {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
11
+ >> m.predict("yportne")
12
+ => {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
13
+ ```
14
+
15
+ The string 'yportne' is much less likely than the string 'entropy'.
16
+
17
+ You can also train a model, using strings one per line.
18
+
19
+ ```ruby
20
+ >> n = Entropic::Model.new(2); true
21
+ => true
22
+ >> File.open('/tmp/training.txt') {|f| n.train(f)}; true
23
+ => true
24
+ >> n.predict('love')
25
+ => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
26
+ ```
27
+
28
+ You can also train a model using strings, each followed by a count of the number of times it appears, tab-separated.
29
+
30
+ ```ruby
31
+ >> o = Entropic::Model.new(2); true
32
+ => true
33
+ >> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
34
+ => true
35
+ >> o.predict('love')
36
+ => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
37
+ ```
38
+
39
+ You can also dump a model, to be read later.
40
+
41
+ ```ruby
42
+ >> File.open('/tmp/save.tsv','w') {|f| o.dump(f)}; true
43
+ => true
44
+ ```
45
+
46
+ ## Installation
47
+
48
+ Add this line to your application's Gemfile:
49
+
50
+ ```ruby
51
+ gem 'entropic'
52
+ ```
53
+
54
+ And then execute:
55
+
56
+ $ bundle
57
+
58
+ Or install it yourself as:
59
+
60
+ $ gem install entropic
61
+
62
+
63
+ ## Development
64
+
65
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
66
+
67
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
+
69
+ ## Contributing
70
+
71
+ Bug reports and pull requests are welcome on GitHub at https://github.com/willf/entropic.
72
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "entropic"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/entropic.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'entropic/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "entropic"
8
+ spec.version = Entropic::VERSION
9
+ spec.authors = ["Will Fitzgerald"]
10
+ spec.email = ["willf@github.com"]
11
+
12
+ spec.description = %q{Train and predict string entropy based on character n-grams.}
13
+ spec.summary = spec.description
14
+ spec.homepage = "https://github.com/willf/entropic"
15
+
16
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
17
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
18
+ # if spec.respond_to?(:metadata)
19
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
20
+ # else
21
+ # raise "RubyGems 2.0 or newer is required to protect against " \
22
+ # "public gem pushes."
23
+ # end
24
+
25
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
26
+ f.match(%r{^(test|spec|features)/})
27
+ end
28
+ spec.bindir = "exe"
29
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
+ spec.require_paths = ["lib"]
31
+
32
+ spec.add_development_dependency "bundler", "~> 1.14"
33
+ spec.add_development_dependency "rake", "~> 10.0"
34
+ spec.add_development_dependency "rspec", "~> 3.0"
35
+ end
@@ -0,0 +1,3 @@
1
module Entropic
  # Public: the gem's release version String (entropic.gemspec uses it as
  # spec.version). Frozen so it cannot be mutated in place at runtime.
  VERSION = "0.1.0".freeze
end
data/lib/entropic.rb ADDED
@@ -0,0 +1,230 @@
1
+ require "entropic/version"
2
+
3
+ # Public: classes and methods useful for estimating entropy on strings.
4
+ #
5
+ # Examples
6
+ #
7
+ # model = Entropic::Model.read(File.open("ngrams.tsv"))
8
+ # model.predict("the")
9
+ # # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
10
+ #
11
module Entropic
  # Public: create a sliding window of ngrams from a string
  #
  # string: The String to slide over
  # n:      The Integer defining the size of the ngrams
  #
  # Examples
  #
  #   Entropic.sliding('01234', 2)
  #   # => ['01', '12', '23', '34']
  #
  # Returns an Array of Strings; it is empty when string is shorter than n.
  def self.sliding(string, n)
    (0..string.length - n).map { |i| string[i, n].to_s }
  end

  # Public: a counter for ngrams of one fixed size
  class NGramCounter
    attr_accessor :size, :counts, :total

    # Public: create an empty counter
    #
    # size: The Integer length of the ngrams this counter tracks
    #
    def initialize(size)
      @size = size
      @counts = Hash.new(0)
      @total = 0
    end

    # Public: update a counter with a string, and a multiplier
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      Entropic.sliding(string, @size).each do |ngram|
        @counts[ngram] += multiplier
        @total += multiplier
      end
    end

    # Public: update a counter with a string, with a multiplier of 1
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: get count for an ngram, with a default
    #
    # Examples
    #
    #   counter = NGramCounter.new(2)
    #   counter.update('01234')
    #   counter.count('01', 0)
    #   #=> 1
    #   counter.count('bob', 0)
    #   #=> 0
    #
    # ngram:        The String to check
    # if_not_found: the value to return when the ngram has no recorded count
    #
    def count(ngram, if_not_found)
      @counts.fetch(ngram, if_not_found)
    end
  end

  # Public: A model for entropy, holding one NGramCounter per ngram size
  class Model
    VERSION = '1.0.0'.freeze
    attr_accessor :size, :map

    # Public: create a model with empty counters for ngram sizes 1..size
    #
    # size: The Integer size of the largest ngram to track
    #
    def initialize(size)
      @size = size
      @map = {}
      (1..size).each { |key| @map[key] = NGramCounter.new(key) }
    end

    # Public: update a model with a string, and a multiplier
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update_with_multiplier('01234', 1)
    #
    # string:     The String to update with
    # multiplier: The Integer describing how much weight (will often be 1)
    #
    def update_with_multiplier(string, multiplier)
      @map.each_value do |counter|
        counter.update_with_multiplier(string, multiplier)
      end
    end

    # Public: update a model with a string, with a multiplier of 1
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #
    # string: The String to update with
    #
    def update(string)
      update_with_multiplier(string, 1)
    end

    # Public: log probability (base 2) of an ngram string in a model
    #
    # Returns the value for the first suffix of key that has a recorded
    # count. If no suffix is found, returns the log probability of a single
    # occurrence relative to the total of the last counter consulted.
    # Returns -Infinity when the model has no counts or key is nil or empty.
    #
    # Examples
    #
    #   model = Model.new(2)
    #   model.update('01234')
    #   model.log_prob('01')
    #
    # key: The String to query
    #
    def log_prob(key)
      last_total = 1
      if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
        return Math.log(0, 2.0) # -Infinity
      end

      # Suffixes are tried shortest first, so a counted 1-gram suffix is
      # returned before any longer ngram is looked up.
      # NOTE(review): confirm shortest-first (rather than longest-first
      # back-off) is the intended order.
      (1..key.size).each do |i|
        k = key[-i..-1]
        counter = @map.fetch(k.size, nil)
        next unless counter
        count = counter.counts.fetch(k, nil)
        return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
        last_total = counter.total
      end
      # found it nowhere. Return '1 count' from last total
      Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
    end

    # Public: dump model to some io object, one line per ngram in the form
    # <ngram size><tab><ngram><tab><count>
    #
    # io: the IOWriter to write to
    #
    def dump(io)
      @map.each do |k, m|
        m.counts.each do |ngram, count|
          io.write("#{k}\t#{ngram}\t#{count}\n")
        end
      end
    end

    # Public: predict the log_prob sum and average over a string
    # which will be split into ngrams of the model's size
    #
    # string: The String to query
    #
    # returns: a dictionary of
    #  - log_prob_total
    #  - log_prob_average (NaN when the string yields no ngrams, i.e. it is
    #    shorter than the model size)
    #  - size (number of ngrams in string)
    def predict(string)
      ngrams = Entropic.sliding(string, @size)
      log_prob_total = ngrams.map { |ngram| log_prob(ngram) }.inject(0.0, :+)
      log_prob_average = log_prob_total / ngrams.size.to_f
      { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
    end

    # Public: create a Model from reading from an IO object
    # each line should be <ngram size><tab><ngram><tab><count>
    #
    # io: the IOReader
    #
    # returns: Model with stats filled in, and size of largest ngram
    def self.read(io)
      model = Model.new(0)
      max_size = 0
      io.each_line do |string|
        ngram_size, ngram, count = string.strip.split(/\t/)
        # Blank or truncated lines would otherwise register a size-0 counter
        # holding a nil ngram, so skip them.
        next if ngram.nil? || count.nil?
        ngram_size = ngram_size.to_i
        count = count.to_f
        counter = (model.map[ngram_size] ||= NGramCounter.new(ngram_size))
        counter.total += count
        # Accumulate so a repeated row keeps total equal to the sum of counts.
        counter.counts[ngram] += count
        max_size = ngram_size if ngram_size > max_size
      end
      model.size = max_size
      model
    end

    # Public: Train a model on a bunch of data, line by line
    #
    # io: the IOReader
    #
    def train(io)
      io.each_line do |string|
        # Drop the line terminator so it is not counted as part of an ngram.
        update(string.chomp)
      end
    end

    # Public: Train a model on a bunch of data, line by line,
    # with a multiplier
    # each data line should be <string><tab><multiplier>
    # lines that are blank or have no multiplier field are skipped
    #
    # io: the IOReader
    #
    def train_with_multiplier(io)
      io.each_line do |string|
        text, count = string.strip.split(/\t/)
        # A blank line yields no text (which used to raise), and a missing
        # multiplier would only add zero-count entries; skip both.
        next if text.nil? || count.nil?
        update_with_multiplier(text, count.to_i)
      end
    end
  end
end
230
+
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: entropic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Will Fitzgerald
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: Train and predict string entropy based on character n-grams.
56
+ email:
57
+ - willf@github.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".rspec"
64
+ - ".travis.yml"
65
+ - Gemfile
66
+ - README.md
67
+ - Rakefile
68
+ - bin/console
69
+ - bin/setup
70
+ - entropic.gemspec
71
+ - lib/entropic.rb
72
+ - lib/entropic/version.rb
73
+ homepage: https://github.com/willf/entropic
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.4.5.1
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Train and predict string entropy based on character n-grams.
96
+ test_files: []