segment_ruby 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -6
- data/lib/segment_ruby/version.rb +1 -1
- data/lib/segment_ruby.rb +40 -49
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26ffd5e24afa505dec8c31cac0cad0cecafc27bb
|
4
|
+
data.tar.gz: 4171510ada22f97967654a976da3c014b5fab121
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcf6a77d8778367c64563d09be67080b935282250fc625cdb2169ea9f461a0dea2a04ef47a400b7f70ed4f2745c534cd5600b3eb13b9b94285aaf6eb65609348
|
7
|
+
data.tar.gz: c7868ead403e68c2154016150456676269376b136dce0de73c30370ce168e27abf839335a46c4854af037a155238241c79ef89a172163055dc16651ef95e5bfa
|
data/README.md
CHANGED
@@ -23,17 +23,17 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
```
|
25
25
|
require 'segment_ruby'
|
26
|
-
t = SegmentRuby::Analyzer.new(
|
26
|
+
t = SegmentRuby::Analyzer.new(:twitter)
|
27
27
|
t.segment("theboywholived")
|
28
28
|
=> ["the", "boy", "who", "lived"]
|
29
29
|
```
|
30
30
|
Models include:
|
31
31
|
|
32
|
-
-
|
33
|
-
-
|
34
|
-
-
|
35
|
-
-
|
36
|
-
-
|
32
|
+
- `:norvig`: based on Google web data
|
33
|
+
- `:google_books`: based on Google books data
|
34
|
+
- `:anchor`: based on Web anchor text
|
35
|
+
- `:twitter`: based on Twitter data
|
36
|
+
- `:small`: smaller version of the Google books data
|
37
37
|
|
38
38
|
The default model is `small`. Use it if it seems to work for you.
|
39
39
|
|
data/lib/segment_ruby/version.rb
CHANGED
data/lib/segment_ruby.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative "./segment_ruby/version"
|
2
2
|
require 'pathname'
|
3
3
|
|
4
4
|
# Based on "Natural Language Corpus Data"
|
@@ -13,126 +13,117 @@ module SegmentRuby
|
|
13
13
|
def initialize(total_file_name, data_file_name)
|
14
14
|
@total_file_name = total_file_name
|
15
15
|
@data_file_name = total_file_name
|
16
|
-
|
16
|
+
|
17
|
+
@log_total = begin
|
17
18
|
total = File.read(total_file_name).to_i
|
18
|
-
|
19
|
+
Math.log2(total)
|
19
20
|
rescue
|
20
|
-
|
21
|
+
Math.log2(10**1000)
|
21
22
|
end
|
22
23
|
|
23
|
-
@table = Hash.new{|w| -Float::INFINITY}
|
24
|
+
@table = Hash.new { |w| -Float::INFINITY }
|
25
|
+
|
24
26
|
File.open(data_file_name).each_line do |line|
|
25
27
|
data = line.split(/\s/)
|
26
28
|
freq = data[-1].to_i
|
27
29
|
keys = data[0..-2]
|
28
30
|
key = keys.join(' ')
|
29
|
-
log_p = Math.log2(freq) -
|
30
|
-
|
31
|
+
log_p = Math.log2(freq) - log_total
|
32
|
+
|
33
|
+
table[key] = log_p
|
31
34
|
end
|
32
|
-
true
|
33
35
|
end
|
34
36
|
|
35
|
-
|
36
|
-
@table
|
37
|
-
end
|
37
|
+
attr_reader :log_total, :table
|
38
38
|
|
39
39
|
def files
|
40
40
|
[@total_file_name, @data_file_name]
|
41
41
|
end
|
42
42
|
|
43
43
|
def log_prob(w)
|
44
|
-
|
44
|
+
table[w]
|
45
45
|
end
|
46
46
|
|
47
47
|
def prob(w)
|
48
|
-
2
|
49
|
-
end
|
50
|
-
|
51
|
-
def log_total
|
52
|
-
@log_total
|
48
|
+
2**table[w]
|
53
49
|
end
|
54
50
|
|
55
51
|
def total
|
56
|
-
2
|
52
|
+
2**log_total
|
57
53
|
end
|
58
54
|
|
59
55
|
def has_key?(w)
|
60
|
-
|
56
|
+
table.has_key?(w)
|
61
57
|
end
|
62
58
|
end
|
63
59
|
|
64
60
|
class Analyzer
|
65
|
-
|
66
|
-
|
67
|
-
@model = model
|
61
|
+
def initialize(model_name=:small, max_word_length=20)
|
62
|
+
@model_name = model_name
|
68
63
|
@max_word_length = max_word_length
|
64
|
+
|
69
65
|
# unigram log probabilities
|
70
|
-
@ulp = ProbabilityDistribution.new(total_file_name
|
66
|
+
@ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)
|
67
|
+
|
71
68
|
# bigram log probabilities
|
72
69
|
btf = total_file_name('2_')
|
73
70
|
bff = freq_file_name('2_')
|
74
|
-
@blp = (
|
75
|
-
true
|
76
|
-
end
|
77
|
-
|
78
|
-
def model
|
79
|
-
@model
|
71
|
+
@blp = (File.exists?(btf) and File.exists?(bff) ? ProbabilityDistribution.new(btf, bff) : false)
|
80
72
|
end
|
81
73
|
|
82
|
-
|
83
|
-
@max_word_length
|
84
|
-
end
|
85
|
-
|
86
|
-
def ulp
|
87
|
-
@ulp
|
88
|
-
end
|
89
|
-
|
90
|
-
def blp
|
91
|
-
@blp
|
92
|
-
end
|
74
|
+
attr_reader :blp, :max_word_length, :model_name, :ulp
|
93
75
|
|
94
76
|
def log_Pr(w)
|
95
|
-
|
77
|
+
ulp.log_prob(w)
|
96
78
|
end
|
97
79
|
|
98
80
|
def log_CPr(w, prev)
|
99
81
|
key = [prev, w].join(' ')
|
100
|
-
|
82
|
+
|
83
|
+
blp and blp.has_key?(key) ? blp.log_prob(key) : ulp.log_prob(w)
|
84
|
+
end
|
85
|
+
|
86
|
+
def total_file_name(prefix='')
|
87
|
+
File.join(model_path, prefix + 'total.tsv')
|
101
88
|
end
|
102
89
|
|
103
|
-
def
|
104
|
-
File.join(
|
90
|
+
def freq_file_name(prefix='')
|
91
|
+
File.join(model_path, prefix + 'frequencies.tsv')
|
105
92
|
end
|
106
93
|
|
107
|
-
def
|
108
|
-
File.join(__dir__, "..", "data", "segment_ruby",
|
94
|
+
def model_path
|
95
|
+
@model_path ||= File.join(__dir__, "..", "data", "segment_ruby", model_name.to_s)
|
109
96
|
end
|
110
97
|
|
111
98
|
# Returns all the splits of a string up to a given length
|
112
99
|
def splits(text)
|
113
|
-
(0..[
|
100
|
+
(0..[max_word_length, text.size-1].min).map { |i| [text[0..i], text[i+1..text.size]] }
|
114
101
|
end
|
115
102
|
|
116
103
|
def combine(pFirst, first, segmented)
|
117
104
|
pRem,rem = segmented
|
105
|
+
|
118
106
|
[pFirst+pRem, [first]+rem]
|
119
107
|
end
|
120
108
|
|
121
109
|
def segment_r(text, prev, n, memo)
|
122
110
|
return [0.0, []] if not text or (text.size == 0)
|
123
111
|
return memo[text] if memo.has_key?(text)
|
112
|
+
|
124
113
|
log_p_segment = splits(text).map do |first, rem|
|
125
114
|
log_p = log_CPr(first, prev)
|
126
115
|
combine(log_p, first, segment_r(rem, first, n+1, memo))
|
127
116
|
end.max
|
117
|
+
|
128
118
|
memo[text] = log_p_segment
|
119
|
+
|
129
120
|
log_p_segment
|
130
121
|
end
|
131
122
|
|
132
123
|
def segment(text, prev='<S>')
|
133
|
-
|
124
|
+
_, segmentation = segment_r(text, prev, 0, Hash.new)
|
125
|
+
|
134
126
|
segmentation
|
135
127
|
end
|
136
|
-
|
137
128
|
end
|
138
129
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: segment_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Fitzgerald
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|