entropic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d35ae5edab8f2a86c92301ba0a35c1c90bfd3d26
4
+ data.tar.gz: 3463138f664205b707808c9ae8783120258c897f
5
+ SHA512:
6
+ metadata.gz: 862964de635c81b16806aa8f9acb1a5e347ac399208d688ed4b5b702ddac216ad29a0f5d339c3e12d66a43eaad05c370ffdfa3bba96740a3f1f2c8b7e2aa7f4d
7
+ data.tar.gz: 2847b9f7d5ae73f36be387594085fd211203e9c365ee25b2cf9655b7ccd1565a9fa2b7d64961ebbb23cccebb9f889607a3b9756309dd0c69efd70f6aa848e055
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
13
+
14
+ tags
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.2.5
5
+ before_install: gem install bundler -v 1.14.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in entropic.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,72 @@
1
+ # Entropic
2
+
3
+ Entropic trains and predicts entropy of strings based on character n-gram models. For example:
4
+
5
+ ```ruby
6
+ require 'entropic'
7
+ >> m = Entropic::Model.read(open('https://raw.githubusercontent.com/willf/entropy/master/data/google_books_2.tsv')); true
8
+ => true
9
+ >> m.predict("entropy")
10
+ => {:log_prob_total=>-37.181802347513745, :log_prob_average=>-6.1969670579189575, :size=>6}
11
+ >> m.predict("yportne")
12
+ => {:log_prob_total=>-34.25705444264748, :log_prob_average=>-5.70950907377458, :size=>6}
13
+ ```
14
+
15
+ The string 'yportne' is much less likely than the string 'entropy'.
16
+
17
+ You can also train a model, using strings one per line.
18
+
19
+ ```ruby
20
+ >> n = Entropic::Model.new(2); true
21
+ => true
22
+ >> File.open('/tmp/training.txt') {|f| n.train(f)}; true
23
+ => true
24
+ >> n.predict('love')
25
+ => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
26
+ ```
27
+
28
+ You can also train a model using strings paired with a count of the number of times each appears, tab separated.
29
+
30
+ ```ruby
31
+ >> o = Entropic::Model.new(2); true
32
+ => true
33
+ >> File.open('/tmp/training_with_counts.txt') {|f| o.train_with_multiplier(f)}; true
34
+ => true
35
+ >> o.predict('love')
36
+ => {:log_prob_total=>-15.396216763909154, :log_prob_average=>-5.132072254636385, :size=>3}
37
+ ```
38
+
39
+ You can also dump a model, to be read later.
40
+
41
+ ```ruby
42
+ >> File.open('/tmp/save.tsv','w') {|f| o.dump(f)}; true
43
+ => true
44
+ ```
45
+
46
+ ## Installation
47
+
48
+ Add this line to your application's Gemfile:
49
+
50
+ ```ruby
51
+ gem 'entropic'
52
+ ```
53
+
54
+ And then execute:
55
+
56
+ $ bundle
57
+
58
+ Or install it yourself as:
59
+
60
+ $ gem install entropic
61
+
62
+
63
+ ## Development
64
+
65
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
66
+
67
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
+
69
+ ## Contributing
70
+
71
+ Bug reports and pull requests are welcome on GitHub at https://github.com/willf/entropic.
72
+
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "entropic"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/entropic.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'entropic/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "entropic"
8
+ spec.version = Entropic::VERSION
9
+ spec.authors = ["Will Fitzgerald"]
10
+ spec.email = ["willf@github.com"]
11
+
12
+ spec.description = %q{Train and predict string entropy based on character n-grams.}
13
+ spec.summary = spec.description
14
+ spec.homepage = "https://github.com/willf/entropic"
15
+
16
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
17
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
18
+ # if spec.respond_to?(:metadata)
19
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
20
+ # else
21
+ # raise "RubyGems 2.0 or newer is required to protect against " \
22
+ # "public gem pushes."
23
+ # end
24
+
25
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
26
+ f.match(%r{^(test|spec|features)/})
27
+ end
28
+ spec.bindir = "exe"
29
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
+ spec.require_paths = ["lib"]
31
+
32
+ spec.add_development_dependency "bundler", "~> 1.14"
33
+ spec.add_development_dependency "rake", "~> 10.0"
34
+ spec.add_development_dependency "rspec", "~> 3.0"
35
+ end
@@ -0,0 +1,3 @@
1
+ module Entropic
2
+ VERSION = "0.1.0"
3
+ end
data/lib/entropic.rb ADDED
@@ -0,0 +1,230 @@
1
+ require "entropic/version"
2
+
3
+ # Public: classes and methods useful for estimating entropy on strings.
4
+ #
5
+ # Examples
6
+ #
7
+ # model = Entropic::Model.read(File.open("ngrams.tsv"))
8
+ # model.predict("the")
9
+ # # => { log_prob_total: -101.1, log_prob_average: -20.02, size: 5 }
10
+ #
11
+ module Entropic
12
+
13
+ # Public: create a sliding window of ngrams from a string
14
+ #
15
+ # string: The String to slide over
16
+ # n: The Integer defining the size of the ngrams
17
+ #
18
+ # Examples
19
+ #
20
+ # sliding('01234', 2)
21
+ # # => ['01', '12', '23', '34']
22
+ #
23
+ def Entropic.sliding(string, n)
24
+ (0..string.length - n).map { |i| (string[i, n]).to_s }
25
+ end
26
+
27
+ # Public: a counter for ngrams
28
+ class NGramCounter
29
+ attr_accessor :size, :counts, :total
30
+ def initialize(size)
31
+ @size = size
32
+ @counts = Hash.new(0)
33
+ @total = 0
34
+ end
35
+
36
+ # Public: update a counter with a string, and a multiplier
37
+ #
38
+ # Examples
39
+ #
40
+ # counter = NGramCounter.new(2)
41
+ # counter.update_with_multiplier('01234', 1)
42
+ #
43
+ # string: The String to update with
44
+ # multiplier: The Integer describing how much weight (will often be 1)
45
+ #
46
+ def update_with_multiplier(string, multiplier)
47
+ Entropic.sliding(string, @size).each do |ngram|
48
+ @counts[ngram] += multiplier
49
+ @total += multiplier
50
+ end
51
+ end
52
+
53
+ # Public: update a counter with a string, with a multiplier of 1
54
+ #
55
+ # Examples
56
+ #
57
+ # counter = NGramCounter.new(2)
58
+ # counter.update('01234')
59
+ #
60
+ # string: The String to update with
61
+ #
62
+ def update(string)
63
+ update_with_multiplier(string, 1)
64
+ end
65
+
66
+ # Public: get count for string, with default
67
+ #
68
+ # Examples
69
+ #
70
+ # counter = NGramCounter.new(2)
71
+ # counter.update('01234')
72
+ # counter.count('01', 0)
73
+ # #=> 1
74
+ # counter.count('bob', 0)
75
+ # #=> 0
76
+ #
77
+ # ngram: The String to check
78
+ # if_not_found: The value to return when the ngram has not been counted
79
+ #
80
+ def count(ngram, if_not_found)
81
+ @counts.fetch(ngram, if_not_found)
82
+ end
83
+ end
84
+
85
+ # Public: a model for entropy
86
+ class Model
87
+ VERSION = '1.0.0'.freeze
88
+ attr_accessor :size, :map
89
+
90
+ def initialize(size)
91
+ @size = size
92
+ @map = {}
93
+ (1..size).each { |key| @map[key] = NGramCounter.new(key) }
94
+ end
95
+
96
+ # Public: update a model with a string, and a multiplier
97
+ #
98
+ # Examples
99
+ #
100
+ # model = Model.new(2)
101
+ # model.update_with_multiplier('01234', 1)
102
+ #
103
+ # string: The String to update with
104
+ # multiplier: The Integer describing how much weight (will often be 1)
105
+ #
106
+ def update_with_multiplier(string, multiplier)
107
+ @map.each do |_, counter|
108
+ counter.update_with_multiplier(string, multiplier)
109
+ end
110
+ end
111
+
112
+ # Public: update a model with a string, with a multiplier of 1
113
+ #
114
+ # Examples
115
+ #
116
+ # model = Model.new(2)
117
+ # model.update('01234')
118
+ #
119
+ # string: The String to update with
120
+ #
121
+ def update(string)
122
+ update_with_multiplier(string, 1)
123
+ end
124
+
125
+ # Public: log probability of a ngram string in a model
126
+ # returns value of first suffix of string
127
+ # or log_prob of a single count against the total of the last counter checked if no suffix found
128
+ #
129
+ # Examples
130
+ #
131
+ # model = Model.new(2)
132
+ # model.update('01234')
133
+ # model.log_prob('01')
134
+ #
135
+ # key: The String to query
136
+ #
137
+ def log_prob(key)
138
+ last_total = 1
139
+ if @map.all? { |_, m| m.counts.empty? } || !key || key == ''
140
+ return Math.log(0, 2.0) # -Infinity
141
+ end
142
+
143
+ (1..key.size).each do |i|
144
+ k = key[-i..-1]
145
+ counter = @map.fetch(k.size, nil)
146
+ next unless counter
147
+ count = counter.counts.fetch(k, nil)
148
+ return Math.log(count, 2.0) - Math.log(counter.total, 2.0) if count
149
+ last_total = counter.total
150
+ end
151
+ # found it nowhere. Return '1 count' from last total
152
+ Math.log(1.0, 2.0) - Math.log(last_total, 2.0)
153
+ end
154
+
155
+ # Public: dump model to some io object
156
+ #
157
+ # io: the IOWriter to write to
158
+ #
159
+ def dump(io)
160
+ @map.each do |k, m|
161
+ m.counts.each do |ngram, count|
162
+ io.write("#{k}\t#{ngram}\t#{count}\n")
163
+ end
164
+ end
165
+ end
166
+
167
+ # Public: predict the log_prob sum and average over a string
168
+ # which will be split into ngrams
169
+ #
170
+ # string: The String to query
171
+ #
172
+ # returns: a dictionary of
173
+ # - log_prob_total
174
+ # - log_prob_average
175
+ # - size (number of ngrams in string)
176
+ def predict(string)
177
+ ngrams = Entropic.sliding(string, @size)
178
+ log_prob_total = ngrams.map { |ngram| log_prob(ngram) }.inject(0.0, :+)
179
+ log_prob_average = log_prob_total / ngrams.size.to_f
180
+ { log_prob_total: log_prob_total, log_prob_average: log_prob_average, size: ngrams.size }
181
+ end
182
+
183
+ # Public: create a Model from reading from an IO object
184
+ #
185
+ # io: the IOReader
186
+ #
187
+ # returns: Model with stats filled in, and size of largest ngram
188
+ def self.read(io)
189
+ model = Model.new(0)
190
+ max_size = 0
191
+ io.each_line do |string|
192
+ ngram_size, ngram, count = string.strip.split(/\t/)
193
+ ngram_size = ngram_size.to_i
194
+ count = count.to_f
195
+ model.map[ngram_size] = NGramCounter.new(ngram_size) unless model.map.include?(ngram_size)
196
+ counter = model.map[ngram_size]
197
+ counter.total += count
198
+ counter.counts[ngram] = count
199
+ max_size = ngram_size if ngram_size > max_size
200
+ end
201
+ model.size = max_size
202
+ model
203
+ end
204
+
205
+ # Public: Train a model on a bunch of data, line by line
206
+ #
207
+ # io: the IOReader
208
+ #
209
+ def train(io)
210
+ io.each_line do |string|
211
+ update(string)
212
+ end
213
+ end
214
+
215
+ # Public: Train a model on a bunch of data, line by line,
216
+ # with a multiplier
217
+ # each data line should be <string><tab><multiplier>
218
+ #
219
+ # io: the IOReader
220
+ #
221
+ def train_with_multiplier(io)
222
+ io.each_line do |string|
223
+ text, count = string.strip.split(/\t/)
224
+ count = count.to_i
225
+ update_with_multiplier(text, count)
226
+ end
227
+ end
228
+ end
229
+ end
230
+
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: entropic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Will Fitzgerald
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-01-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: Train and predict string entropy based on character n-grams.
56
+ email:
57
+ - willf@github.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".rspec"
64
+ - ".travis.yml"
65
+ - Gemfile
66
+ - README.md
67
+ - Rakefile
68
+ - bin/console
69
+ - bin/setup
70
+ - entropic.gemspec
71
+ - lib/entropic.rb
72
+ - lib/entropic/version.rb
73
+ homepage: https://github.com/willf/entropic
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.4.5.1
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Train and predict string entropy based on character n-grams.
96
+ test_files: []