maxixe 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/README.md +10 -0
- data/lib/maxixe.rb +39 -15
- data/lib/maxixe/version.rb +1 -1
- data/maxixe.gemspec +2 -2
- data/spec/segmenter/segmenter_spec.rb +10 -5
- data/spec/trainer/trainer_spec.rb +22 -2
- metadata +10 -16
- data/lib/.maxixe.rb.swp +0 -0
- data/spec/segmenter/.segmenter_spec.rb.swp +0 -0
- data/spec/trainer/.first_file.swp +0 -0
- data/spec/trainer/.second_file.swp +0 -0
- data/spec/trainer/.trainer_spec.rb.swp +0 -0
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Maxixe
|
|
2
|
+
### A simple statistical segmenter for any language
|
|
3
|
+
|
|
4
|
+
## About
|
|
5
|
+
|
|
6
|
+
Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.
|
|
7
|
+
|
|
8
|
+
## How to use
|
|
9
|
+
|
|
10
|
+
First, you need a hash that contains the count of all n-grams in a given corpus.
|
data/lib/maxixe.rb
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
require "text"
|
|
1
2
|
module Maxixe
|
|
2
3
|
class Segmenter
|
|
3
4
|
|
|
@@ -98,28 +99,51 @@ module Maxixe
|
|
|
98
99
|
|
|
99
100
|
class Trainer
|
|
100
101
|
|
|
101
|
-
def self.
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
102
|
+
def self.generate_corpus_from_io(n , io)
|
|
103
|
+
result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
|
|
104
|
+
io.each_line do |line|
|
|
105
|
+
n.each do |c_n|
|
|
106
|
+
n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
|
|
107
|
+
n_grams.each do |n_gram|
|
|
108
|
+
result[c_n.to_s][n_gram] += 1
|
|
109
|
+
end
|
|
110
|
+
end
|
|
105
111
|
end
|
|
112
|
+
result
|
|
106
113
|
end
|
|
107
114
|
|
|
108
|
-
def self.
|
|
109
|
-
|
|
115
|
+
def self.optimize(index, samples)
|
|
116
|
+
res = check_recognition(index, samples)
|
|
117
|
+
min = nil
|
|
118
|
+
res.each do |n, ts|
|
|
119
|
+
ts.each do |t, score|
|
|
120
|
+
if !min or score < min[1]
|
|
121
|
+
min = [[n,t],score]
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
{:n => min[0][0], :t => min[0][1], :score => min[1]}
|
|
126
|
+
end
|
|
110
127
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
128
|
+
def self.check_recognition(index, samples)
|
|
129
|
+
# Get all subsets of N
|
|
130
|
+
ns = 1.upto(index.keys.size).map{|i| index.keys.combination(i).to_a}.flatten(1)
|
|
131
|
+
results = ns.inject({}) do |res, n|
|
|
132
|
+
n_index = index.select{|key, value| n.include? key}
|
|
133
|
+
m = Maxixe::Segmenter.new(n_index)
|
|
134
|
+
|
|
135
|
+
t_values = ((0.1)..(1.0)).step(0.1).inject({}) do |res, t|
|
|
136
|
+
difference = samples.inject(0) do |result, (not_split, split)|
|
|
137
|
+
temp = m.segment(not_split, t)
|
|
138
|
+
result += Text::Levenshtein.distance(temp, split)
|
|
119
139
|
end
|
|
140
|
+
res[t] = difference
|
|
141
|
+
res
|
|
120
142
|
end
|
|
143
|
+
res[n] = t_values
|
|
144
|
+
res
|
|
121
145
|
end
|
|
122
|
-
|
|
146
|
+
results
|
|
123
147
|
end
|
|
124
148
|
end
|
|
125
149
|
end
|
data/lib/maxixe/version.rb
CHANGED
data/maxixe.gemspec
CHANGED
|
@@ -10,11 +10,11 @@ Gem::Specification.new do |s|
|
|
|
10
10
|
s.email = ["maxixe@rogerbraun.net"]
|
|
11
11
|
s.homepage = "https://github.com/rogerbraun/Maxixe"
|
|
12
12
|
s.summary = %q{A small statistical segmenter for any language.}
|
|
13
|
-
s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm
|
|
13
|
+
s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}
|
|
14
14
|
|
|
15
15
|
s.rubyforge_project = "maxixe"
|
|
16
16
|
|
|
17
|
-
s.add_dependency "
|
|
17
|
+
s.add_dependency "text"
|
|
18
18
|
s.add_development_dependency "rspec"
|
|
19
19
|
|
|
20
20
|
s.files = `git ls-files`.split("\n")
|
|
@@ -56,12 +56,17 @@ describe Maxixe::Segmenter do
|
|
|
56
56
|
end
|
|
57
57
|
|
|
58
58
|
describe "Segmenting Text" do
|
|
59
|
-
before(:each) do
|
|
60
|
-
@segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
|
|
61
|
-
end
|
|
62
59
|
|
|
63
|
-
it "should
|
|
64
|
-
|
|
60
|
+
it "should do some examples" do
|
|
61
|
+
index = Maxixe::Trainer.generate_corpus_from_io([3], "ILIKEMYDOG
|
|
62
|
+
THISHOUSEISMYHOUSE
|
|
63
|
+
MYDOGISSONICE
|
|
64
|
+
INMYHOUSETHEREAREFOURDOGS
|
|
65
|
+
IWANTAHOUSEFORMYDOG")
|
|
66
|
+
|
|
67
|
+
m = Maxixe::Segmenter.new(index,0.3)
|
|
68
|
+
m.segment("FOURNICEDOGS").should == "FOUR NICE DOGS"
|
|
69
|
+
m.segment("MYDOGISINTHEHOUSE").should == "MY DOG IS IN THE HOUSE"
|
|
65
70
|
end
|
|
66
71
|
end
|
|
67
72
|
end
|
|
@@ -2,12 +2,32 @@ require "spec_helper"
|
|
|
2
2
|
|
|
3
3
|
describe Maxixe::Trainer do
|
|
4
4
|
|
|
5
|
-
it "should generate n-gram data from
|
|
5
|
+
it "should generate n-gram data from IOs" do
|
|
6
6
|
|
|
7
7
|
pwd = File.dirname(__FILE__)
|
|
8
8
|
|
|
9
|
-
Maxixe::Trainer.
|
|
9
|
+
Maxixe::Trainer.generate_corpus_from_io([2,3], open(File.join(pwd, "first_file"))).should == {"2"=>{"AB"=>1, "BC"=>1, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1}, "3"=>{"ABC"=>1, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1}}
|
|
10
10
|
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
+
it "should be able to find the optimal threshold and n values" do
|
|
14
|
+
pre_segmented = [["MYDOGISINTHEHOUSE", "MY DOG IS IN THE HOUSE"],
|
|
15
|
+
["FOURNICEDOGS", "FOUR NICE DOGS"],
|
|
16
|
+
["MYCATLIKESMYDOG", "MY CAT LIKES MY DOG"]]
|
|
17
|
+
index = Maxixe::Trainer.generate_corpus_from_io([2,3,4,5], "ILIKEMYDOG
|
|
18
|
+
THISHOUSEISMYHOUSE
|
|
19
|
+
MYDOGISSONICE
|
|
20
|
+
WHOLIKESDOGSANYWAY
|
|
21
|
+
CATSANDDOGSUSUALLYFIGHT
|
|
22
|
+
INMYHOUSETHEREAREFOURDOGS
|
|
23
|
+
IWANTAHOUSEFORMYDOG")
|
|
24
|
+
|
|
25
|
+
optimal = Maxixe::Trainer.optimize(index, pre_segmented)
|
|
26
|
+
optimal[:n].should == ["2","4"]
|
|
27
|
+
optimal[:score].should == 0
|
|
28
|
+
optimal[:t].should be_within(0.01).of(0.5)
|
|
29
|
+
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
|
|
13
33
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: maxixe
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,12 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2011-
|
|
13
|
-
default_executable:
|
|
12
|
+
date: 2011-09-11 00:00:00.000000000 Z
|
|
14
13
|
dependencies:
|
|
15
14
|
- !ruby/object:Gem::Dependency
|
|
16
|
-
name:
|
|
17
|
-
requirement: &
|
|
15
|
+
name: text
|
|
16
|
+
requirement: &81697730 !ruby/object:Gem::Requirement
|
|
18
17
|
none: false
|
|
19
18
|
requirements:
|
|
20
19
|
- - ! '>='
|
|
@@ -22,10 +21,10 @@ dependencies:
|
|
|
22
21
|
version: '0'
|
|
23
22
|
type: :runtime
|
|
24
23
|
prerelease: false
|
|
25
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *81697730
|
|
26
25
|
- !ruby/object:Gem::Dependency
|
|
27
26
|
name: rspec
|
|
28
|
-
requirement: &
|
|
27
|
+
requirement: &81697520 !ruby/object:Gem::Requirement
|
|
29
28
|
none: false
|
|
30
29
|
requirements:
|
|
31
30
|
- - ! '>='
|
|
@@ -33,11 +32,11 @@ dependencies:
|
|
|
33
32
|
version: '0'
|
|
34
33
|
type: :development
|
|
35
34
|
prerelease: false
|
|
36
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *81697520
|
|
37
36
|
description: Maxixe is an implementation of the Tango algorithm describe in the paper
|
|
38
37
|
"Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
|
|
39
38
|
and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
|
|
40
|
-
text given enough corpus data and a tuning of the algorithm
|
|
39
|
+
text given enough corpus data and a tuning of the algorithm parameters.
|
|
41
40
|
email:
|
|
42
41
|
- maxixe@rogerbraun.net
|
|
43
42
|
executables: []
|
|
@@ -47,21 +46,16 @@ files:
|
|
|
47
46
|
- .gitignore
|
|
48
47
|
- .rspec
|
|
49
48
|
- Gemfile
|
|
49
|
+
- README.md
|
|
50
50
|
- Rakefile
|
|
51
|
-
- lib/.maxixe.rb.swp
|
|
52
51
|
- lib/maxixe.rb
|
|
53
52
|
- lib/maxixe/version.rb
|
|
54
53
|
- maxixe.gemspec
|
|
55
|
-
- spec/segmenter/.segmenter_spec.rb.swp
|
|
56
54
|
- spec/segmenter/segmenter_spec.rb
|
|
57
55
|
- spec/spec_helper.rb
|
|
58
|
-
- spec/trainer/.first_file.swp
|
|
59
|
-
- spec/trainer/.second_file.swp
|
|
60
|
-
- spec/trainer/.trainer_spec.rb.swp
|
|
61
56
|
- spec/trainer/first_file
|
|
62
57
|
- spec/trainer/second_file
|
|
63
58
|
- spec/trainer/trainer_spec.rb
|
|
64
|
-
has_rdoc: true
|
|
65
59
|
homepage: https://github.com/rogerbraun/Maxixe
|
|
66
60
|
licenses: []
|
|
67
61
|
post_install_message:
|
|
@@ -82,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
82
76
|
version: '0'
|
|
83
77
|
requirements: []
|
|
84
78
|
rubyforge_project: maxixe
|
|
85
|
-
rubygems_version: 1.
|
|
79
|
+
rubygems_version: 1.8.9
|
|
86
80
|
signing_key:
|
|
87
81
|
specification_version: 3
|
|
88
82
|
summary: A small statistical segmenter for any language.
|
data/lib/.maxixe.rb.swp
DELETED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|