segment_ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +21 -0
  7. data/README.md +48 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/data/segment_ruby/anchor/README.md +36 -0
  12. data/data/segment_ruby/anchor/frequencies.tsv +100000 -0
  13. data/data/segment_ruby/anchor/total.tsv +1 -0
  14. data/data/segment_ruby/google_books/2_frequencies.tsv +250000 -0
  15. data/data/segment_ruby/google_books/2_total.tsv +1 -0
  16. data/data/segment_ruby/google_books/README.md +65 -0
  17. data/data/segment_ruby/google_books/frequencies.tsv +333333 -0
  18. data/data/segment_ruby/google_books/total.tsv +1 -0
  19. data/data/segment_ruby/norvig/2_frequencies.tsv +286358 -0
  20. data/data/segment_ruby/norvig/2_total.tsv +1 -0
  21. data/data/segment_ruby/norvig/README.md +65 -0
  22. data/data/segment_ruby/norvig/frequencies.tsv +333333 -0
  23. data/data/segment_ruby/norvig/total.tsv +1 -0
  24. data/data/segment_ruby/small/2_frequencies.tsv +20000 -0
  25. data/data/segment_ruby/small/2_total.tsv +1 -0
  26. data/data/segment_ruby/small/README.md +6 -0
  27. data/data/segment_ruby/small/frequencies.tsv +100000 -0
  28. data/data/segment_ruby/small/total.tsv +1 -0
  29. data/data/segment_ruby/twitter/2_frequencies.tsv +100000 -0
  30. data/data/segment_ruby/twitter/2_total.tsv +1 -0
  31. data/data/segment_ruby/twitter/README.md +66 -0
  32. data/data/segment_ruby/twitter/frequencies.tsv +100000 -0
  33. data/data/segment_ruby/twitter/total.tsv +1 -0
  34. data/lib/segment_ruby.rb +138 -0
  35. data/lib/segment_ruby/version.rb +3 -0
  36. data/segment_ruby.gemspec +35 -0
  37. metadata +134 -0
@@ -0,0 +1 @@
1
+ 1174595007
@@ -0,0 +1,138 @@
1
+ require "segment_ruby/version"
2
+ require 'pathname'
3
+
4
+ # Based on "Natural Language Corpus Data"
5
+ # from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
6
+ # by Peter Norvig
7
+
8
+ module SegmentRuby
9
+
10
+ LOG_1 = Math.log2(1)
11
+
12
# Log2 probability table loaded from a pair of TSV files.
#
# The data file is read line by line: each line is split on whitespace, the
# last field is taken as an integer count and the remaining fields (joined by
# a single space) form the key. The total file supplies the integer used as
# the denominator. Every stored value is log2(count) - log2(total); keys that
# were never loaded yield -Infinity.
class ProbabilityDistribution
  # log2 of the denominator used when the total file is missing or unusable.
  DEFAULT_LOG_TOTAL = Math.log2(10**1000)

  # @return [Hash{String => Float}] key => log2 probability
  attr_reader :table

  # @return [Float] log2 of the denominator
  attr_reader :log_total

  # @param total_file_name [String] path of the file holding the total count
  # @param data_file_name [String] path of the frequency file
  # @raise [SystemCallError] if the frequency file cannot be read
  def initialize(total_file_name, data_file_name)
    @total_file_name = total_file_name
    # The original stored total_file_name here, so #files reported the
    # total file twice.
    @data_file_name = data_file_name
    @log_total = read_log_total(total_file_name)
    @table = load_table(data_file_name)
  end

  # @return [Array<String>] the total-file path followed by the data-file path
  def files
    [@total_file_name, @data_file_name]
  end

  # @param w [String] a key (words joined by single spaces)
  # @return [Float] log2 probability of +w+, -Infinity when unknown
  def log_prob(w)
    @table[w]
  end

  # @param w [String] a key (words joined by single spaces)
  # @return [Float] probability of +w+ (0.0 when unknown)
  def prob(w)
    2**@table[w]
  end

  # @return [Float] the denominator, recovered from its logarithm
  def total
    2**@log_total
  end

  # @param w [String] a key (words joined by single spaces)
  # @return [Boolean] whether +w+ was present in the data file
  def has_key?(w)
    @table.key?(w)
  end

  private

  # Reads the denominator and returns its log2. Falls back to
  # DEFAULT_LOG_TOTAL when the file cannot be read or does not hold a
  # positive integer, so a model without a usable total still loads.
  def read_log_total(path)
    total = File.read(path).to_i
    total > 0 ? Math.log2(total) : DEFAULT_LOG_TOTAL
  rescue StandardError
    # Deliberately best-effort: any read failure means "use the default".
    DEFAULT_LOG_TOTAL
  end

  # Builds the key => log2 probability hash from the frequency file.
  # File.foreach closes the file once iteration finishes.
  def load_table(path)
    table = Hash.new { |_hash, _key| -Float::INFINITY }
    File.foreach(path) do |line|
      *words, count = line.split(/\s/)
      next if count.nil? # blank line: nothing to record

      table[words.join(' ')] = Math.log2(count.to_i) - @log_total
    end
    table
  end
end
63
+
64
# Splits unspaced text into the word sequence with the highest summed log2
# probability, scoring each word with the model's unigram table and, when the
# model ships bigram files, its bigram table.
class Analyzer
  # @return [String] name of the model directory in use
  attr_reader :model

  # @return [Integer] longest candidate word considered by #splits
  attr_reader :max_word_length

  # @return [ProbabilityDistribution] unigram log probabilities
  attr_reader :ulp

  # @return [ProbabilityDistribution, false] bigram log probabilities, or
  #   false when the model has no bigram files
  attr_reader :blp

  # @param model [String] sub-directory of data/segment_ruby to load
  # @param max_word_length [Integer] longest candidate word, in characters
  def initialize(model='small', max_word_length=20)
    @model = model
    @max_word_length = max_word_length
    @ulp = ProbabilityDistribution.new(total_file_name(''), freq_file_name(''))

    bigram_total = total_file_name('2_')
    bigram_freq = freq_file_name('2_')
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    @blp = if File.exist?(bigram_total) && File.exist?(bigram_freq)
             ProbabilityDistribution.new(bigram_total, bigram_freq)
           else
             false
           end
  end

  # @param w [String] a word
  # @return [Float] unigram log2 probability of +w+
  def log_Pr(w)
    @ulp.log_prob(w)
  end

  # Scores +w+ given the previous word: the bigram table's entry for
  # "prev w" when there is one, otherwise the unigram score of +w+.
  # NOTE(review): the bigram entry is used as stored, without dividing by the
  # probability of +prev+ - confirm this is the intended scoring.
  #
  # @param w [String] the candidate word
  # @param prev [String] the word immediately before it
  # @return [Float] a log2 score
  def log_CPr(w, prev)
    key = [prev, w].join(' ')
    return @blp.log_prob(key) if @blp && @blp.has_key?(key)

    @ulp.log_prob(w)
  end

  # @param prefix [String] '' for unigram files, '2_' for bigram files
  # @return [String] path of the model's total-count file
  def total_file_name(prefix)
    File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'total.tsv')
  end

  # @param prefix [String] '' for unigram files, '2_' for bigram files
  # @return [String] path of the model's frequency file
  def freq_file_name(prefix)
    File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'frequencies.tsv')
  end

  # Returns every [first, remainder] split of +text+ whose first part is at
  # most max_word_length characters long (at least one character is always
  # tried). The original range ran one step too far and produced first parts
  # of max_word_length + 1 characters.
  #
  # @param text [String]
  # @return [Array<Array(String, String)>]
  def splits(text)
    longest = [[@max_word_length, 1].max, text.size].min
    (1..longest).map { |len| [text[0, len], text[len..-1]] }
  end

  # Prepends a scored word to an already scored segmentation.
  #
  # @param pFirst [Float] log2 score of +first+
  # @param first [String] the word being prepended
  # @param segmented [Array(Float, Array<String>)] score and words of the rest
  # @return [Array(Float, Array<String>)] summed score and combined words
  def combine(pFirst, first, segmented)
    rest_score, rest_words = segmented
    [pFirst + rest_score, [first] + rest_words]
  end

  # Recursive worker for #segment.
  #
  # @param text [String, nil] text still to be segmented
  # @param prev [String] the word chosen immediately before +text+
  # @param n [Integer] recursion depth; passed along but not otherwise used
  # @param memo [Hash] cache of already computed results
  # @return [Array(Float, Array<String>)] best score and its words
  def segment_r(text, prev, n, memo)
    return [0.0, []] if !text || text.empty?

    # The score of a split depends on the previous word as well as the text,
    # so both belong in the cache key; keying on text alone could return a
    # result computed for a different +prev+.
    memo_key = [text, prev]
    return memo[memo_key] if memo.key?(memo_key)

    candidates = splits(text).map do |first, rem|
      combine(log_CPr(first, prev), first, segment_r(rem, first, n + 1, memo))
    end
    # Array comparison orders candidates by score first, then by word list.
    memo[memo_key] = candidates.max
  end

  # @param text [String] unspaced text to segment
  # @param prev [String] the word preceding +text+
  # @return [Array<String>] words of the best-scoring segmentation
  def segment(text, prev='<S>')
    _score, words = segment_r(text, prev, 0, {})
    words
  end
end
138
+ end
@@ -0,0 +1,3 @@
1
# Namespace of the segment_ruby gem.
module SegmentRuby
  # Version string of this release; read by the gemspec.
  VERSION = '0.1.0'
end
@@ -0,0 +1,35 @@
1
# coding: utf-8
# Gem specification for segment_ruby. The version is read from
# lib/segment_ruby/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'segment_ruby/version'

Gem::Specification.new do |spec|
  spec.name          = "segment_ruby"
  spec.version       = SegmentRuby::VERSION
  spec.authors       = ["Will Fitzgerald"]
  spec.email         = ["will.fitzgerald@pobox.com"]

  spec.summary       = %q{segments text according to word frequency using the Viterbi algorithm.}
  spec.homepage      = "https://github.com/willf/segment_ruby"

  # NOTE(review): no spec.license is set although a LICENSE file is packaged;
  # consider declaring it once the license type is confirmed.

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every git-tracked file except the test directories.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  # Only files under exe/ are installed as executables.
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency 'pry', '~> 0.9.12.2'
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: segment_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Will Fitzgerald
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-10-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.9.12.2
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.9.12.2
69
+ description:
70
+ email:
71
+ - will.fitzgerald@pobox.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".rspec"
78
+ - ".travis.yml"
79
+ - Gemfile
80
+ - LICENSE
81
+ - README.md
82
+ - Rakefile
83
+ - bin/console
84
+ - bin/setup
85
+ - data/segment_ruby/anchor/README.md
86
+ - data/segment_ruby/anchor/frequencies.tsv
87
+ - data/segment_ruby/anchor/total.tsv
88
+ - data/segment_ruby/google_books/2_frequencies.tsv
89
+ - data/segment_ruby/google_books/2_total.tsv
90
+ - data/segment_ruby/google_books/README.md
91
+ - data/segment_ruby/google_books/frequencies.tsv
92
+ - data/segment_ruby/google_books/total.tsv
93
+ - data/segment_ruby/norvig/2_frequencies.tsv
94
+ - data/segment_ruby/norvig/2_total.tsv
95
+ - data/segment_ruby/norvig/README.md
96
+ - data/segment_ruby/norvig/frequencies.tsv
97
+ - data/segment_ruby/norvig/total.tsv
98
+ - data/segment_ruby/small/2_frequencies.tsv
99
+ - data/segment_ruby/small/2_total.tsv
100
+ - data/segment_ruby/small/README.md
101
+ - data/segment_ruby/small/frequencies.tsv
102
+ - data/segment_ruby/small/total.tsv
103
+ - data/segment_ruby/twitter/2_frequencies.tsv
104
+ - data/segment_ruby/twitter/2_total.tsv
105
+ - data/segment_ruby/twitter/README.md
106
+ - data/segment_ruby/twitter/frequencies.tsv
107
+ - data/segment_ruby/twitter/total.tsv
108
+ - lib/segment_ruby.rb
109
+ - lib/segment_ruby/version.rb
110
+ - segment_ruby.gemspec
111
+ homepage: https://github.com/willf/segment_ruby
112
+ licenses: []
113
+ metadata: {}
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.5.1
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: segments text according word frequency using the Viterbi algorithm.
134
+ test_files: []