segment_ruby 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE +21 -0
  7. data/README.md +48 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/data/segment_ruby/anchor/README.md +36 -0
  12. data/data/segment_ruby/anchor/frequencies.tsv +100000 -0
  13. data/data/segment_ruby/anchor/total.tsv +1 -0
  14. data/data/segment_ruby/google_books/2_frequencies.tsv +250000 -0
  15. data/data/segment_ruby/google_books/2_total.tsv +1 -0
  16. data/data/segment_ruby/google_books/README.md +65 -0
  17. data/data/segment_ruby/google_books/frequencies.tsv +333333 -0
  18. data/data/segment_ruby/google_books/total.tsv +1 -0
  19. data/data/segment_ruby/norvig/2_frequencies.tsv +286358 -0
  20. data/data/segment_ruby/norvig/2_total.tsv +1 -0
  21. data/data/segment_ruby/norvig/README.md +65 -0
  22. data/data/segment_ruby/norvig/frequencies.tsv +333333 -0
  23. data/data/segment_ruby/norvig/total.tsv +1 -0
  24. data/data/segment_ruby/small/2_frequencies.tsv +20000 -0
  25. data/data/segment_ruby/small/2_total.tsv +1 -0
  26. data/data/segment_ruby/small/README.md +6 -0
  27. data/data/segment_ruby/small/frequencies.tsv +100000 -0
  28. data/data/segment_ruby/small/total.tsv +1 -0
  29. data/data/segment_ruby/twitter/2_frequencies.tsv +100000 -0
  30. data/data/segment_ruby/twitter/2_total.tsv +1 -0
  31. data/data/segment_ruby/twitter/README.md +66 -0
  32. data/data/segment_ruby/twitter/frequencies.tsv +100000 -0
  33. data/data/segment_ruby/twitter/total.tsv +1 -0
  34. data/lib/segment_ruby.rb +138 -0
  35. data/lib/segment_ruby/version.rb +3 -0
  36. data/segment_ruby.gemspec +35 -0
  37. metadata +134 -0
@@ -0,0 +1 @@
1
+ 1174595007
@@ -0,0 +1,138 @@
1
+ require "segment_ruby/version"
2
+ require 'pathname'
3
+
4
+ # Based on "Natural Language Corpus Data"
5
+ # from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
6
+ # by Peter Norvig
7
+
8
+ module SegmentRuby
9
+
10
+ LOG_1 = Math.log2(1)
11
+
12
# Table of base-2 log probabilities for n-grams, loaded from two files.
#
# * The "total" file is read whole and converted with +to_i+; it is the
#   count every frequency is divided by.
# * The "data" file holds one n-gram per line: whitespace-separated tokens
#   followed by an integer count as the last field.
#
# Keys of the table are the tokens of a line joined with a single space.
class ProbabilityDistribution
  # log2 of the total assumed when the total file cannot be read or does not
  # contain a positive integer.
  FALLBACK_LOG_TOTAL = Math.log2(10**1000)

  # table::     Hash of n-gram => log2 probability; a missing n-gram reads as
  #             -Infinity without being stored.
  # log_total:: log2 of the total count.
  attr_reader :table, :log_total

  # total_file_name:: path of the file holding the total count.
  # data_file_name::  path of the file holding the n-gram counts.
  #
  # Raises whatever File.foreach raises when the data file cannot be read;
  # problems with the total file fall back to FALLBACK_LOG_TOTAL instead.
  def initialize(total_file_name, data_file_name)
    @total_file_name = total_file_name
    @data_file_name = data_file_name
    # The table needs @log_total, so the total must be loaded first.
    @log_total = read_log_total(total_file_name)
    @table = read_table(data_file_name)
  end

  # Returns [total_file_name, data_file_name] as given to the constructor.
  def files
    [@total_file_name, @data_file_name]
  end

  # log2 probability of the n-gram +w+ (-Infinity when it is unknown).
  def log_prob(w)
    @table[w]
  end

  # Probability of the n-gram +w+ (0.0 when it is unknown).
  def prob(w)
    2**@table[w]
  end

  # The total count, recovered from its logarithm (so it is a Float).
  def total
    2**@log_total
  end

  # True when +w+ was present in the data file.
  def has_key?(w)
    @table.has_key?(w)
  end

  private

  # Reads the total count and returns its log2. This is best effort: an
  # unreadable file or a non-positive total yields FALLBACK_LOG_TOTAL.
  def read_log_total(path)
    total = File.read(path).to_i
    total > 0 ? Math.log2(total) : FALLBACK_LOG_TOTAL
  rescue StandardError
    FALLBACK_LOG_TOTAL
  end

  # Builds the n-gram => log2 probability table from the data file.
  def read_table(path)
    table = Hash.new(-Float::INFINITY)
    # File.foreach closes the file once iteration ends.
    File.foreach(path) do |line|
      *tokens, count = line.split
      next if tokens.empty? # blank line or lone field: no n-gram to record

      table[tokens.join(' ')] = Math.log2(count.to_i) - @log_total
    end
    table
  end
end
63
+
64
# Splits unspaced text into words by choosing, among all candidate
# segmentations, the one whose summed log2 probabilities is largest.
#
# Unigram probabilities are always loaded for the chosen model; bigram
# probabilities are loaded only when both "2_" files exist for that model.
class Analyzer
  # model::           name of the data directory the tables were loaded from.
  # max_word_length:: longest candidate word (in characters) that is tried.
  # ulp::             unigram ProbabilityDistribution.
  # blp::             bigram ProbabilityDistribution, or false when the model
  #                   ships no bigram files.
  attr_reader :model, :max_word_length, :ulp, :blp

  # model::           data directory name under data/segment_ruby.
  # max_word_length:: upper bound on the length of a candidate word.
  def initialize(model='small', max_word_length=20)
    @model = model
    @max_word_length = max_word_length
    @ulp = ProbabilityDistribution.new(total_file_name(''), freq_file_name(''))

    bigram_total = total_file_name('2_')
    bigram_freq = freq_file_name('2_')
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    @blp = if File.exist?(bigram_total) && File.exist?(bigram_freq)
             ProbabilityDistribution.new(bigram_total, bigram_freq)
           else
             false
           end
  end

  # Unigram log2 probability of +w+.
  def log_Pr(w)
    @ulp.log_prob(w)
  end

  # Score of +w+ following +prev+: the bigram table's value for "prev w" when
  # a bigram table is loaded and has that pair, otherwise the unigram value
  # of +w+.
  def log_CPr(w, prev)
    pair = "#{prev} #{w}"
    if @blp && @blp.has_key?(pair)
      @blp.log_prob(pair)
    else
      @ulp.log_prob(w)
    end
  end

  # Path of the model's total file; +prefix+ is '' (unigram) or '2_' (bigram).
  def total_file_name(prefix)
    model_file("#{prefix}total.tsv")
  end

  # Path of the model's frequency file; +prefix+ as for total_file_name.
  def freq_file_name(prefix)
    model_file("#{prefix}frequencies.tsv")
  end

  # Returns every [first, rest] split of +text+ whose first part is at most
  # max_word_length characters long (and never shorter than one character,
  # so that a non-empty text always yields at least one split).
  def splits(text)
    longest = [[@max_word_length, 1].max, text.size].min
    (1..longest).map { |len| [text[0, len], text[len..-1]] }
  end

  # Prepends +first+ (scored +pFirst+) to an already segmented remainder
  # given as [log probability, words].
  def combine(pFirst, first, segmented)
    pRem, rem = segmented
    [pFirst + pRem, [first] + rem]
  end

  # Recursive worker behind #segment.
  #
  # text:: remaining text to split.
  # prev:: the word chosen immediately before +text+.
  # n::    recursion depth (passed along, not otherwise used).
  # memo:: Hash caching results across the recursion.
  #
  # Returns [log probability, words]. Candidates are compared as arrays, so
  # ties on the score are broken by comparing the word lists.
  def segment_r(text, prev, n, memo)
    return [0.0, []] if !text || text.empty?

    # The best split of +text+ depends on +prev+ whenever bigrams are used,
    # so both are part of the cache key.
    key = [text, prev]
    return memo[key] if memo.has_key?(key)

    memo[key] = splits(text).map do |first, rem|
      combine(log_CPr(first, prev), first, segment_r(rem, first, n + 1, memo))
    end.max
  end

  # Returns the most probable list of words for +text+; +prev+ is the token
  # assumed to precede it.
  def segment(text, prev='<S>')
    _log_p, words = segment_r(text, prev, 0, {})
    words
  end

  private

  # Joins +basename+ onto the data directory of the current model.
  def model_file(basename)
    File.join(__dir__, "..", "data", "segment_ruby", @model, basename)
  end
end
138
+ end
@@ -0,0 +1,3 @@
1
module SegmentRuby
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,35 @@
1
# coding: utf-8
# Gem specification for segment_ruby.

# Put lib/ on the load path so the version file can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'segment_ruby/version'

Gem::Specification.new do |spec|
  spec.name          = "segment_ruby"
  spec.version       = SegmentRuby::VERSION
  spec.authors       = ["Will Fitzgerald"]
  spec.email         = ["will.fitzgerald@pobox.com"]

  spec.summary       = %q{Segments text according to word frequency using the Viterbi algorithm.}
  spec.homepage      = "https://github.com/willf/segment_ruby"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every file tracked by git except those under test/, spec/ or features/.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency 'pry', '~> 0.9.12.2'
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: segment_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Will Fitzgerald
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-10-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.9.12.2
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.9.12.2
69
+ description:
70
+ email:
71
+ - will.fitzgerald@pobox.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".rspec"
78
+ - ".travis.yml"
79
+ - Gemfile
80
+ - LICENSE
81
+ - README.md
82
+ - Rakefile
83
+ - bin/console
84
+ - bin/setup
85
+ - data/segment_ruby/anchor/README.md
86
+ - data/segment_ruby/anchor/frequencies.tsv
87
+ - data/segment_ruby/anchor/total.tsv
88
+ - data/segment_ruby/google_books/2_frequencies.tsv
89
+ - data/segment_ruby/google_books/2_total.tsv
90
+ - data/segment_ruby/google_books/README.md
91
+ - data/segment_ruby/google_books/frequencies.tsv
92
+ - data/segment_ruby/google_books/total.tsv
93
+ - data/segment_ruby/norvig/2_frequencies.tsv
94
+ - data/segment_ruby/norvig/2_total.tsv
95
+ - data/segment_ruby/norvig/README.md
96
+ - data/segment_ruby/norvig/frequencies.tsv
97
+ - data/segment_ruby/norvig/total.tsv
98
+ - data/segment_ruby/small/2_frequencies.tsv
99
+ - data/segment_ruby/small/2_total.tsv
100
+ - data/segment_ruby/small/README.md
101
+ - data/segment_ruby/small/frequencies.tsv
102
+ - data/segment_ruby/small/total.tsv
103
+ - data/segment_ruby/twitter/2_frequencies.tsv
104
+ - data/segment_ruby/twitter/2_total.tsv
105
+ - data/segment_ruby/twitter/README.md
106
+ - data/segment_ruby/twitter/frequencies.tsv
107
+ - data/segment_ruby/twitter/total.tsv
108
+ - lib/segment_ruby.rb
109
+ - lib/segment_ruby/version.rb
110
+ - segment_ruby.gemspec
111
+ homepage: https://github.com/willf/segment_ruby
112
+ licenses: []
113
+ metadata: {}
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.5.1
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: segments text according word frequency using the Viterbi algorithm.
134
+ test_files: []