segment_ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +48 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/data/segment_ruby/anchor/README.md +36 -0
- data/data/segment_ruby/anchor/frequencies.tsv +100000 -0
- data/data/segment_ruby/anchor/total.tsv +1 -0
- data/data/segment_ruby/google_books/2_frequencies.tsv +250000 -0
- data/data/segment_ruby/google_books/2_total.tsv +1 -0
- data/data/segment_ruby/google_books/README.md +65 -0
- data/data/segment_ruby/google_books/frequencies.tsv +333333 -0
- data/data/segment_ruby/google_books/total.tsv +1 -0
- data/data/segment_ruby/norvig/2_frequencies.tsv +286358 -0
- data/data/segment_ruby/norvig/2_total.tsv +1 -0
- data/data/segment_ruby/norvig/README.md +65 -0
- data/data/segment_ruby/norvig/frequencies.tsv +333333 -0
- data/data/segment_ruby/norvig/total.tsv +1 -0
- data/data/segment_ruby/small/2_frequencies.tsv +20000 -0
- data/data/segment_ruby/small/2_total.tsv +1 -0
- data/data/segment_ruby/small/README.md +6 -0
- data/data/segment_ruby/small/frequencies.tsv +100000 -0
- data/data/segment_ruby/small/total.tsv +1 -0
- data/data/segment_ruby/twitter/2_frequencies.tsv +100000 -0
- data/data/segment_ruby/twitter/2_total.tsv +1 -0
- data/data/segment_ruby/twitter/README.md +66 -0
- data/data/segment_ruby/twitter/frequencies.tsv +100000 -0
- data/data/segment_ruby/twitter/total.tsv +1 -0
- data/lib/segment_ruby.rb +138 -0
- data/lib/segment_ruby/version.rb +3 -0
- data/segment_ruby.gemspec +35 -0
- metadata +134 -0
@@ -0,0 +1 @@
|
|
1
|
+
1174595007
|
data/lib/segment_ruby.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require "segment_ruby/version"
|
2
|
+
require 'pathname'
|
3
|
+
|
4
|
+
# Based on "Natural Language Corpus Data"
|
5
|
+
# from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
|
6
|
+
# by Peter Norvig
|
7
|
+
|
8
|
+
module SegmentRuby
|
9
|
+
|
10
|
+
# log2(1), which evaluates to 0.0. Not referenced by the classes visible in this file.
LOG_1 = Math.log2(1)
|
11
|
+
|
12
|
+
# Table of log2 probabilities built from two files on disk:
#
# * a "total" file whose contents are parsed with String#to_i to obtain the
#   overall count, and
# * a "frequencies" file with one entry per line. Each line is split on single
#   whitespace characters; the last field is the count and the preceding
#   fields, joined by one space, form the key (one word for unigram files,
#   two words for bigram files).
#
# Keys that are absent from the table score -Infinity.
class ProbabilityDistribution
  # @return [Hash{String => Float}] key => log2 probability (default -Infinity)
  attr_reader :table

  # @return [Float] log2 of the total count (or of the fallback total)
  attr_reader :log_total

  # @param total_file_name [String] path of the file holding the total count
  # @param data_file_name [String] path of the frequency file
  # @raise [SystemCallError] if the frequency file cannot be read
  def initialize(total_file_name, data_file_name)
    @total_file_name = total_file_name
    # The original assigned total_file_name here as well, so #files reported
    # the total file twice.
    @data_file_name = data_file_name
    @log_total = read_log_total(total_file_name)

    # A Float is immutable, so a plain default value is safe to share.
    @table = Hash.new(-Float::INFINITY)

    # File.foreach closes the file when the block finishes; the original
    # File.open(...).each_line left the handle open until garbage collection.
    File.foreach(data_file_name) do |line|
      fields = line.split(/\s/)
      next if fields.empty? # blank line: nothing to record

      freq = fields[-1].to_i
      key = fields[0..-2].join(' ')
      @table[key] = Math.log2(freq) - @log_total
    end
  end

  # @return [Array<String>] the total file path followed by the frequency file path
  def files
    [@total_file_name, @data_file_name]
  end

  # @param w [String] key to look up
  # @return [Float] log2 probability of +w+, -Infinity when unknown
  def log_prob(w)
    @table[w]
  end

  # @param w [String] key to look up
  # @return [Float] probability of +w+ (2 raised to its log2 probability)
  def prob(w)
    2**@table[w]
  end

  # @return [Float] 2 raised to #log_total
  def total
    2**@log_total
  end

  # @param w [String] key to test
  # @return [Boolean] true when +w+ was present in the frequency file
  def has_key?(w)
    @table.key?(w)
  end

  private

  # Reads the total count and returns its log2. Any StandardError (missing
  # file, Math::DomainError for a negative total, ...) deliberately falls back
  # to log2(10**1000), exactly as the original bare `rescue` did.
  def read_log_total(path)
    Math.log2(File.read(path).to_i)
  rescue StandardError
    Math.log2(10**1000)
  end
end
|
63
|
+
|
64
|
+
# Splits a run of characters into the highest-scoring sequence of words, using
# unigram log probabilities and, when the model ships bigram files, bigram log
# probabilities.
#
# Model files are looked up under <this file's dir>/../data/segment_ruby/<model>/.
#
# Words missing from the unigram table score -Infinity, so candidate
# segmentations containing an unknown word all tie at -Infinity and the tie is
# broken by Array comparison of the word lists.
class Analyzer
  # @return [String] name of the model directory
  attr_reader :model

  # @return [Integer] longest candidate word, in characters
  attr_reader :max_word_length

  # @return [ProbabilityDistribution] unigram log probabilities
  attr_reader :ulp

  # @return [ProbabilityDistribution, false] bigram log probabilities, or
  #   false when the model has no bigram files
  attr_reader :blp

  # @param model [String] name of the model directory to load
  # @param max_word_length [Integer] longest candidate word, in characters
  def initialize(model='small', max_word_length=20)
    @model = model
    @max_word_length = max_word_length
    # unigram log probabilities
    @ulp = ProbabilityDistribution.new(total_file_name(''), freq_file_name(''))
    # bigram log probabilities (optional)
    bigram_total = total_file_name('2_')
    bigram_freq = freq_file_name('2_')
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    @blp = if File.exist?(bigram_total) && File.exist?(bigram_freq)
             ProbabilityDistribution.new(bigram_total, bigram_freq)
           else
             false
           end
  end

  # @param w [String] word
  # @return [Float] unigram log2 probability of +w+
  def log_Pr(w)
    @ulp.log_prob(w)
  end

  # Score of +w+ given the preceding word +prev+: the bigram table entry for
  # "prev w" when present, otherwise the unigram score of +w+.
  #
  # NOTE(review): the bigram entry is used as-is and is not divided by the
  # probability of +prev+; confirm whether that is intentional.
  #
  # @param w [String] word
  # @param prev [String] preceding word
  # @return [Float]
  def log_CPr(w, prev)
    key = [prev, w].join(' ')
    @blp && @blp.has_key?(key) ? @blp.log_prob(key) : @ulp.log_prob(w)
  end

  # @param prefix [String] '' for unigram files, '2_' for bigram files
  # @return [String] path of the model's total file
  def total_file_name(prefix)
    File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'total.tsv')
  end

  # @param prefix [String] '' for unigram files, '2_' for bigram files
  # @return [String] path of the model's frequency file
  def freq_file_name(prefix)
    File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'frequencies.tsv')
  end

  # Returns every [first, remainder] split of +text+ whose first part is at
  # most max_word_length characters long (and at least one character).
  #
  # The original iterated indices 0..max_word_length and so produced first
  # parts of up to max_word_length + 1 characters.
  #
  # @param text [String]
  # @return [Array<Array(String, String)>]
  def splits(text)
    return [] if text.empty?

    # Always allow one character so a non-empty text has at least one split.
    longest = [[@max_word_length, 1].max, text.size].min
    (1..longest).map { |len| [text[0, len], text[len..-1]] }
  end

  # Prepends +first+ (scored +pFirst+) to an already segmented remainder.
  #
  # @param pFirst [Float] log probability of +first+
  # @param first [String] leading word
  # @param segmented [Array(Float, Array<String>)] score and words of the rest
  # @return [Array(Float, Array<String>)]
  def combine(pFirst, first, segmented)
    rest_log_p, rest_words = segmented
    [pFirst + rest_log_p, [first] + rest_words]
  end

  # Recursive worker for #segment.
  #
  # @param text [String, nil] text still to be segmented
  # @param prev [String] word preceding +text+
  # @param n [Integer] recursion depth (passed along, not otherwise used)
  # @param memo [Hash] cache shared across the recursion
  # @return [Array(Float, Array<String>)] best score and its words
  def segment_r(text, prev, n, memo)
    return [0.0, []] if !text || text.empty?

    # With bigrams the score of +text+ depends on +prev+, so the cache key must
    # include it; the original keyed on +text+ alone and could reuse a result
    # computed for a different predecessor. Without bigrams +prev+ has no
    # effect and the cheaper text-only key is kept.
    memo_key = @blp ? [text, prev] : text
    return memo[memo_key] if memo.key?(memo_key)

    best = splits(text).map do |first, rem|
      combine(log_CPr(first, prev), first, segment_r(rem, first, n + 1, memo))
    end.max
    memo[memo_key] = best
  end

  # Segments +text+ into words.
  #
  # @param text [String] characters to segment
  # @param prev [String] word assumed to precede +text+
  # @return [Array<String>] the highest-scoring sequence of words
  def segment(text, prev='<S>')
    _log_p, words = segment_r(text, prev, 0, {})
    words
  end
end
|
138
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'segment_ruby/version'
|
5
|
+
|
6
|
+
# Gem specification for segment_ruby. The version comes from
# SegmentRuby::VERSION (loaded via the require above this block).
Gem::Specification.new do |spec|
  spec.name          = "segment_ruby"
  spec.version       = SegmentRuby::VERSION
  spec.authors       = ["Will Fitzgerald"]
  spec.email         = ["will.fitzgerald@pobox.com"]

  # Grammar fix: the original read "according word frequency".
  spec.summary       = %q{segments text according to word frequency using the Viterbi algorithm.}
  spec.homepage      = "https://github.com/willf/segment_ruby"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Package every git-tracked file except those under test/, spec/ or features/.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  # Executables are taken from tracked files under exe/.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency 'pry', '~> 0.9.12.2'
end
|
metadata
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: segment_ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Will Fitzgerald
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-10-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.13'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.13'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.9.12.2
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.9.12.2
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- will.fitzgerald@pobox.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- ".rspec"
|
78
|
+
- ".travis.yml"
|
79
|
+
- Gemfile
|
80
|
+
- LICENSE
|
81
|
+
- README.md
|
82
|
+
- Rakefile
|
83
|
+
- bin/console
|
84
|
+
- bin/setup
|
85
|
+
- data/segment_ruby/anchor/README.md
|
86
|
+
- data/segment_ruby/anchor/frequencies.tsv
|
87
|
+
- data/segment_ruby/anchor/total.tsv
|
88
|
+
- data/segment_ruby/google_books/2_frequencies.tsv
|
89
|
+
- data/segment_ruby/google_books/2_total.tsv
|
90
|
+
- data/segment_ruby/google_books/README.md
|
91
|
+
- data/segment_ruby/google_books/frequencies.tsv
|
92
|
+
- data/segment_ruby/google_books/total.tsv
|
93
|
+
- data/segment_ruby/norvig/2_frequencies.tsv
|
94
|
+
- data/segment_ruby/norvig/2_total.tsv
|
95
|
+
- data/segment_ruby/norvig/README.md
|
96
|
+
- data/segment_ruby/norvig/frequencies.tsv
|
97
|
+
- data/segment_ruby/norvig/total.tsv
|
98
|
+
- data/segment_ruby/small/2_frequencies.tsv
|
99
|
+
- data/segment_ruby/small/2_total.tsv
|
100
|
+
- data/segment_ruby/small/README.md
|
101
|
+
- data/segment_ruby/small/frequencies.tsv
|
102
|
+
- data/segment_ruby/small/total.tsv
|
103
|
+
- data/segment_ruby/twitter/2_frequencies.tsv
|
104
|
+
- data/segment_ruby/twitter/2_total.tsv
|
105
|
+
- data/segment_ruby/twitter/README.md
|
106
|
+
- data/segment_ruby/twitter/frequencies.tsv
|
107
|
+
- data/segment_ruby/twitter/total.tsv
|
108
|
+
- lib/segment_ruby.rb
|
109
|
+
- lib/segment_ruby/version.rb
|
110
|
+
- segment_ruby.gemspec
|
111
|
+
homepage: https://github.com/willf/segment_ruby
|
112
|
+
licenses: []
|
113
|
+
metadata: {}
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
require_paths:
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.4.5.1
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: segments text according word frequency using the Viterbi algorithm.
|
134
|
+
test_files: []
|