maxixe 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/README.md +10 -0
- data/lib/maxixe.rb +39 -15
- data/lib/maxixe/version.rb +1 -1
- data/maxixe.gemspec +2 -2
- data/spec/segmenter/segmenter_spec.rb +10 -5
- data/spec/trainer/trainer_spec.rb +22 -2
- metadata +10 -16
- data/lib/.maxixe.rb.swp +0 -0
- data/spec/segmenter/.segmenter_spec.rb.swp +0 -0
- data/spec/trainer/.first_file.swp +0 -0
- data/spec/trainer/.second_file.swp +0 -0
- data/spec/trainer/.trainer_spec.rb.swp +0 -0
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Maxixe
|
2
|
+
### A simple statistical segmenter for any language
|
3
|
+
|
4
|
+
## About
|
5
|
+
|
6
|
+
Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and some tuning of the algorithm parameters.
|
7
|
+
|
8
|
+
## How to use
|
9
|
+
|
10
|
+
First, you need a hash that contains the count of all n-grams in a given corpus.
|
data/lib/maxixe.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require "text"
|
1
2
|
module Maxixe
|
2
3
|
class Segmenter
|
3
4
|
|
@@ -98,28 +99,51 @@ module Maxixe
|
|
98
99
|
|
99
100
|
class Trainer
|
100
101
|
|
101
|
-
def self.
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
def self.generate_corpus_from_io(n , io)
|
103
|
+
result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
|
104
|
+
io.each_line do |line|
|
105
|
+
n.each do |c_n|
|
106
|
+
n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
|
107
|
+
n_grams.each do |n_gram|
|
108
|
+
result[c_n.to_s][n_gram] += 1
|
109
|
+
end
|
110
|
+
end
|
105
111
|
end
|
112
|
+
result
|
106
113
|
end
|
107
114
|
|
108
|
-
def self.
|
109
|
-
|
115
|
+
def self.optimize(index, samples)
|
116
|
+
res = check_recognition(index, samples)
|
117
|
+
min = nil
|
118
|
+
res.each do |n, ts|
|
119
|
+
ts.each do |t, score|
|
120
|
+
if !min or score < min[1]
|
121
|
+
min = [[n,t],score]
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
{:n => min[0][0], :t => min[0][1], :score => min[1]}
|
126
|
+
end
|
110
127
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
128
|
+
def self.check_recognition(index, samples)
|
129
|
+
# Get all subsets of N
|
130
|
+
ns = 1.upto(index.keys.size).map{|i| index.keys.combination(i).to_a}.flatten(1)
|
131
|
+
results = ns.inject({}) do |res, n|
|
132
|
+
n_index = index.select{|key, value| n.include? key}
|
133
|
+
m = Maxixe::Segmenter.new(n_index)
|
134
|
+
|
135
|
+
t_values = ((0.1)..(1.0)).step(0.1).inject({}) do |res, t|
|
136
|
+
difference = samples.inject(0) do |result, (not_split, split)|
|
137
|
+
temp = m.segment(not_split, t)
|
138
|
+
result += Text::Levenshtein.distance(temp, split)
|
119
139
|
end
|
140
|
+
res[t] = difference
|
141
|
+
res
|
120
142
|
end
|
143
|
+
res[n] = t_values
|
144
|
+
res
|
121
145
|
end
|
122
|
-
|
146
|
+
results
|
123
147
|
end
|
124
148
|
end
|
125
149
|
end
|
data/lib/maxixe/version.rb
CHANGED
data/maxixe.gemspec
CHANGED
@@ -10,11 +10,11 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.email = ["maxixe@rogerbraun.net"]
|
11
11
|
s.homepage = "https://github.com/rogerbraun/Maxixe"
|
12
12
|
s.summary = %q{A small statistical segmenter for any language.}
|
13
|
-
s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm
|
13
|
+
s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}
|
14
14
|
|
15
15
|
s.rubyforge_project = "maxixe"
|
16
16
|
|
17
|
-
s.add_dependency "
|
17
|
+
s.add_dependency "text"
|
18
18
|
s.add_development_dependency "rspec"
|
19
19
|
|
20
20
|
s.files = `git ls-files`.split("\n")
|
@@ -56,12 +56,17 @@ describe Maxixe::Segmenter do
|
|
56
56
|
end
|
57
57
|
|
58
58
|
describe "Segmenting Text" do
|
59
|
-
before(:each) do
|
60
|
-
@segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
|
61
|
-
end
|
62
59
|
|
63
|
-
it "should
|
64
|
-
|
60
|
+
it "should do some examples" do
|
61
|
+
index = Maxixe::Trainer.generate_corpus_from_io([3], "ILIKEMYDOG
|
62
|
+
THISHOUSEISMYHOUSE
|
63
|
+
MYDOGISSONICE
|
64
|
+
INMYHOUSETHEREAREFOURDOGS
|
65
|
+
IWANTAHOUSEFORMYDOG")
|
66
|
+
|
67
|
+
m = Maxixe::Segmenter.new(index,0.3)
|
68
|
+
m.segment("FOURNICEDOGS").should == "FOUR NICE DOGS"
|
69
|
+
m.segment("MYDOGISINTHEHOUSE").should == "MY DOG IS IN THE HOUSE"
|
65
70
|
end
|
66
71
|
end
|
67
72
|
end
|
@@ -2,12 +2,32 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe Maxixe::Trainer do
|
4
4
|
|
5
|
-
it "should generate n-gram data from
|
5
|
+
it "should generate n-gram data from IOs" do
|
6
6
|
|
7
7
|
pwd = File.dirname(__FILE__)
|
8
8
|
|
9
|
-
Maxixe::Trainer.
|
9
|
+
Maxixe::Trainer.generate_corpus_from_io([2,3], open(File.join(pwd, "first_file"))).should == {"2"=>{"AB"=>1, "BC"=>1, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1}, "3"=>{"ABC"=>1, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1}}
|
10
10
|
|
11
11
|
end
|
12
12
|
|
13
|
+
it "should be able to find the optimal threshold and n values" do
|
14
|
+
pre_segmented = [["MYDOGISINTHEHOUSE", "MY DOG IS IN THE HOUSE"],
|
15
|
+
["FOURNICEDOGS", "FOUR NICE DOGS"],
|
16
|
+
["MYCATLIKESMYDOG", "MY CAT LIKES MY DOG"]]
|
17
|
+
index = Maxixe::Trainer.generate_corpus_from_io([2,3,4,5], "ILIKEMYDOG
|
18
|
+
THISHOUSEISMYHOUSE
|
19
|
+
MYDOGISSONICE
|
20
|
+
WHOLIKESDOGSANYWAY
|
21
|
+
CATSANDDOGSUSUALLYFIGHT
|
22
|
+
INMYHOUSETHEREAREFOURDOGS
|
23
|
+
IWANTAHOUSEFORMYDOG")
|
24
|
+
|
25
|
+
optimal = Maxixe::Trainer.optimize(index, pre_segmented)
|
26
|
+
optimal[:n].should == ["2","4"]
|
27
|
+
optimal[:score].should == 0
|
28
|
+
optimal[:t].should be_within(0.01).of(0.5)
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
|
13
33
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxixe
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
13
|
-
default_executable:
|
12
|
+
date: 2011-09-11 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
17
|
-
requirement: &
|
15
|
+
name: text
|
16
|
+
requirement: &81697730 !ruby/object:Gem::Requirement
|
18
17
|
none: false
|
19
18
|
requirements:
|
20
19
|
- - ! '>='
|
@@ -22,10 +21,10 @@ dependencies:
|
|
22
21
|
version: '0'
|
23
22
|
type: :runtime
|
24
23
|
prerelease: false
|
25
|
-
version_requirements: *
|
24
|
+
version_requirements: *81697730
|
26
25
|
- !ruby/object:Gem::Dependency
|
27
26
|
name: rspec
|
28
|
-
requirement: &
|
27
|
+
requirement: &81697520 !ruby/object:Gem::Requirement
|
29
28
|
none: false
|
30
29
|
requirements:
|
31
30
|
- - ! '>='
|
@@ -33,11 +32,11 @@ dependencies:
|
|
33
32
|
version: '0'
|
34
33
|
type: :development
|
35
34
|
prerelease: false
|
36
|
-
version_requirements: *
|
35
|
+
version_requirements: *81697520
|
37
36
|
description: Maxixe is an implementation of the Tango algorithm describe in the paper
|
38
37
|
"Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
|
39
38
|
and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
|
40
|
-
text given enough corpus data and a tuning of the algorithm
|
39
|
+
text given enough corpus data and a tuning of the algorithm parameters.
|
41
40
|
email:
|
42
41
|
- maxixe@rogerbraun.net
|
43
42
|
executables: []
|
@@ -47,21 +46,16 @@ files:
|
|
47
46
|
- .gitignore
|
48
47
|
- .rspec
|
49
48
|
- Gemfile
|
49
|
+
- README.md
|
50
50
|
- Rakefile
|
51
|
-
- lib/.maxixe.rb.swp
|
52
51
|
- lib/maxixe.rb
|
53
52
|
- lib/maxixe/version.rb
|
54
53
|
- maxixe.gemspec
|
55
|
-
- spec/segmenter/.segmenter_spec.rb.swp
|
56
54
|
- spec/segmenter/segmenter_spec.rb
|
57
55
|
- spec/spec_helper.rb
|
58
|
-
- spec/trainer/.first_file.swp
|
59
|
-
- spec/trainer/.second_file.swp
|
60
|
-
- spec/trainer/.trainer_spec.rb.swp
|
61
56
|
- spec/trainer/first_file
|
62
57
|
- spec/trainer/second_file
|
63
58
|
- spec/trainer/trainer_spec.rb
|
64
|
-
has_rdoc: true
|
65
59
|
homepage: https://github.com/rogerbraun/Maxixe
|
66
60
|
licenses: []
|
67
61
|
post_install_message:
|
@@ -82,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
76
|
version: '0'
|
83
77
|
requirements: []
|
84
78
|
rubyforge_project: maxixe
|
85
|
-
rubygems_version: 1.
|
79
|
+
rubygems_version: 1.8.9
|
86
80
|
signing_key:
|
87
81
|
specification_version: 3
|
88
82
|
summary: A small statistical segmenter for any language.
|
data/lib/.maxixe.rb.swp
DELETED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|