segment_ruby 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -6
- data/lib/segment_ruby/version.rb +1 -1
- data/lib/segment_ruby.rb +40 -49
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26ffd5e24afa505dec8c31cac0cad0cecafc27bb
|
4
|
+
data.tar.gz: 4171510ada22f97967654a976da3c014b5fab121
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcf6a77d8778367c64563d09be67080b935282250fc625cdb2169ea9f461a0dea2a04ef47a400b7f70ed4f2745c534cd5600b3eb13b9b94285aaf6eb65609348
|
7
|
+
data.tar.gz: c7868ead403e68c2154016150456676269376b136dce0de73c30370ce168e27abf839335a46c4854af037a155238241c79ef89a172163055dc16651ef95e5bfa
|
data/README.md
CHANGED
@@ -23,17 +23,17 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
```
|
25
25
|
require 'segment_ruby'
|
26
|
-
t = SegmentRuby::Analyzer.new(
|
26
|
+
t = SegmentRuby::Analyzer.new(:twitter)
|
27
27
|
t.segment("theboywholived")
|
28
28
|
=> ["the", "boy", "who", "lived"]
|
29
29
|
```
|
30
30
|
Models include:
|
31
31
|
|
32
|
-
-
|
33
|
-
-
|
34
|
-
-
|
35
|
-
-
|
36
|
-
-
|
32
|
+
- `:norvig`: based on Google web data
|
33
|
+
- `:google_books`: based on Google books data
|
34
|
+
- `:anchor`: based on Web anchor text
|
35
|
+
- `:twitter`: based on Twitter data
|
36
|
+
- `:small`: smaller version of the Google books data
|
37
37
|
|
38
38
|
The default model is `small`. Use it if it seems to work for you.
|
39
39
|
|
data/lib/segment_ruby/version.rb
CHANGED
data/lib/segment_ruby.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative "./segment_ruby/version"
|
2
2
|
require 'pathname'
|
3
3
|
|
4
4
|
# Based on "Natural Language Corpus Data"
|
@@ -13,126 +13,117 @@ module SegmentRuby
|
|
13
13
|
def initialize(total_file_name, data_file_name)
|
14
14
|
@total_file_name = total_file_name
|
15
15
|
@data_file_name = total_file_name
|
16
|
-
|
16
|
+
|
17
|
+
@log_total = begin
|
17
18
|
total = File.read(total_file_name).to_i
|
18
|
-
|
19
|
+
Math.log2(total)
|
19
20
|
rescue
|
20
|
-
|
21
|
+
Math.log2(10**1000)
|
21
22
|
end
|
22
23
|
|
23
|
-
@table = Hash.new{|w| -Float::INFINITY}
|
24
|
+
@table = Hash.new { |w| -Float::INFINITY }
|
25
|
+
|
24
26
|
File.open(data_file_name).each_line do |line|
|
25
27
|
data = line.split(/\s/)
|
26
28
|
freq = data[-1].to_i
|
27
29
|
keys = data[0..-2]
|
28
30
|
key = keys.join(' ')
|
29
|
-
log_p = Math.log2(freq) -
|
30
|
-
|
31
|
+
log_p = Math.log2(freq) - log_total
|
32
|
+
|
33
|
+
table[key] = log_p
|
31
34
|
end
|
32
|
-
true
|
33
35
|
end
|
34
36
|
|
35
|
-
|
36
|
-
@table
|
37
|
-
end
|
37
|
+
attr_reader :log_total, :table
|
38
38
|
|
39
39
|
def files
|
40
40
|
[@total_file_name, @data_file_name]
|
41
41
|
end
|
42
42
|
|
43
43
|
def log_prob(w)
|
44
|
-
|
44
|
+
table[w]
|
45
45
|
end
|
46
46
|
|
47
47
|
def prob(w)
|
48
|
-
2
|
49
|
-
end
|
50
|
-
|
51
|
-
def log_total
|
52
|
-
@log_total
|
48
|
+
2**table[w]
|
53
49
|
end
|
54
50
|
|
55
51
|
def total
|
56
|
-
2
|
52
|
+
2**log_total
|
57
53
|
end
|
58
54
|
|
59
55
|
def has_key?(w)
|
60
|
-
|
56
|
+
table.has_key?(w)
|
61
57
|
end
|
62
58
|
end
|
63
59
|
|
64
60
|
class Analyzer
|
65
|
-
|
66
|
-
|
67
|
-
@model = model
|
61
|
+
def initialize(model_name=:small, max_word_length=20)
|
62
|
+
@model_name = model_name
|
68
63
|
@max_word_length = max_word_length
|
64
|
+
|
69
65
|
# unigram log probabilities
|
70
|
-
@ulp = ProbabilityDistribution.new(total_file_name
|
66
|
+
@ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)
|
67
|
+
|
71
68
|
# bigram log probabilities
|
72
69
|
btf = total_file_name('2_')
|
73
70
|
bff = freq_file_name('2_')
|
74
|
-
@blp = (
|
75
|
-
true
|
76
|
-
end
|
77
|
-
|
78
|
-
def model
|
79
|
-
@model
|
71
|
+
@blp = (File.exists?(btf) and File.exists?(bff) ? ProbabilityDistribution.new(btf, bff) : false)
|
80
72
|
end
|
81
73
|
|
82
|
-
|
83
|
-
@max_word_length
|
84
|
-
end
|
85
|
-
|
86
|
-
def ulp
|
87
|
-
@ulp
|
88
|
-
end
|
89
|
-
|
90
|
-
def blp
|
91
|
-
@blp
|
92
|
-
end
|
74
|
+
attr_reader :blp, :max_word_length, :model_name, :ulp
|
93
75
|
|
94
76
|
def log_Pr(w)
|
95
|
-
|
77
|
+
ulp.log_prob(w)
|
96
78
|
end
|
97
79
|
|
98
80
|
def log_CPr(w, prev)
|
99
81
|
key = [prev, w].join(' ')
|
100
|
-
|
82
|
+
|
83
|
+
blp and blp.has_key?(key) ? blp.log_prob(key) : ulp.log_prob(w)
|
84
|
+
end
|
85
|
+
|
86
|
+
def total_file_name(prefix='')
|
87
|
+
File.join(model_path, prefix + 'total.tsv')
|
101
88
|
end
|
102
89
|
|
103
|
-
def
|
104
|
-
File.join(
|
90
|
+
def freq_file_name(prefix='')
|
91
|
+
File.join(model_path, prefix + 'frequencies.tsv')
|
105
92
|
end
|
106
93
|
|
107
|
-
def
|
108
|
-
File.join(__dir__, "..", "data", "segment_ruby",
|
94
|
+
def model_path
|
95
|
+
@model_path ||= File.join(__dir__, "..", "data", "segment_ruby", model_name.to_s)
|
109
96
|
end
|
110
97
|
|
111
98
|
# Returns all the splits of a string up to a given length
|
112
99
|
def splits(text)
|
113
|
-
(0..[
|
100
|
+
(0..[max_word_length, text.size-1].min).map { |i| [text[0..i], text[i+1..text.size]] }
|
114
101
|
end
|
115
102
|
|
116
103
|
def combine(pFirst, first, segmented)
|
117
104
|
pRem,rem = segmented
|
105
|
+
|
118
106
|
[pFirst+pRem, [first]+rem]
|
119
107
|
end
|
120
108
|
|
121
109
|
def segment_r(text, prev, n, memo)
|
122
110
|
return [0.0, []] if not text or (text.size == 0)
|
123
111
|
return memo[text] if memo.has_key?(text)
|
112
|
+
|
124
113
|
log_p_segment = splits(text).map do |first, rem|
|
125
114
|
log_p = log_CPr(first, prev)
|
126
115
|
combine(log_p, first, segment_r(rem, first, n+1, memo))
|
127
116
|
end.max
|
117
|
+
|
128
118
|
memo[text] = log_p_segment
|
119
|
+
|
129
120
|
log_p_segment
|
130
121
|
end
|
131
122
|
|
132
123
|
def segment(text, prev='<S>')
|
133
|
-
|
124
|
+
_, segmentation = segment_r(text, prev, 0, Hash.new)
|
125
|
+
|
134
126
|
segmentation
|
135
127
|
end
|
136
|
-
|
137
128
|
end
|
138
129
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: segment_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Fitzgerald
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|