maxixe 0.0.1 → 0.1.0

data/.gitignore CHANGED
@@ -2,3 +2,4 @@
  .bundle
  Gemfile.lock
  pkg/*
+ *.sw?
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source "http://rubygems.org"
 
  # Specify your gem's dependencies in maxixe.gemspec
  gemspec
+
data/README.md ADDED
@@ -0,0 +1,10 @@
+ # Maxixe
+ ### A simple statistical segmenter for any language
+
+ ## About
+
+ Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text, given enough corpus data and some tuning of the algorithm parameters.
+
+ ## How to use
+
+ First, you need a hash that contains the counts of all n-grams in a given corpus.
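A rough usage sketch assembled from the specs later in this diff (the corpus text, the variable names, and the 0.3 threshold are illustrative; `require "maxixe"` is assumed as the entry point):

```ruby
require "maxixe"

corpus = "ILIKEMYDOG
THISHOUSEISMYHOUSE
MYDOGISSONICE
INMYHOUSETHEREAREFOURDOGS
IWANTAHOUSEFORMYDOG"

# Count n-grams of the requested sizes (here, trigrams only) in the corpus.
# The second argument only needs to respond to #each_line, so a String or an open IO works.
index = Maxixe::Trainer.generate_corpus_from_io([3], corpus)

# Build a segmenter from the n-gram index and a threshold t, then segment text.
segmenter = Maxixe::Segmenter.new(index, 0.3)
segmenter.segment("FOURNICEDOGS")      # => "FOUR NICE DOGS"
segmenter.segment("MYDOGISINTHEHOUSE") # => "MY DOG IS IN THE HOUSE"
```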
data/lib/maxixe.rb CHANGED
@@ -1,3 +1,4 @@
+ require "text"
  module Maxixe
    class Segmenter
 
@@ -98,28 +99,51 @@ module Maxixe
 
    class Trainer
 
-     def self.generate_and_dump(n, output, *files)
-       res = self.generate_training_data(n, *files)
-       File.open(output,"w") do |file|
-         Yajl::Encoder.encode res, file
+     def self.generate_corpus_from_io(n , io)
+       result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
+       io.each_line do |line|
+         n.each do |c_n|
+           n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
+           n_grams.each do |n_gram|
+             result[c_n.to_s][n_gram] += 1
+           end
+         end
        end
+       result
      end
 
-     def self.generate_training_data(n, *files)
-       result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
+     def self.optimize(index, samples)
+       res = check_recognition(index, samples)
+       min = nil
+       res.each do |n, ts|
+         ts.each do |t, score|
+           if !min or score < min[1]
+             min = [[n,t],score]
+           end
+         end
+       end
+       {:n => min[0][0], :t => min[0][1], :score => min[1]}
+     end
 
-       files.each do |file|
-         input = open(file)
-         input.each_line do |line|
-           n.each do |c_n|
-             n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
-             n_grams.each do |n_gram|
-               result[c_n.to_s][n_gram] += 1
-             end
+     def self.check_recognition(index, samples)
+       # Get all subsets of N
+       ns = 1.upto(index.keys.size).map{|i| index.keys.combination(i).to_a}.flatten(1)
+       results = ns.inject({}) do |res, n|
+         n_index = index.select{|key, value| n.include? key}
+         m = Maxixe::Segmenter.new(n_index)
+
+         t_values = ((0.1)..(1.0)).step(0.1).inject({}) do |res, t|
+           difference = samples.inject(0) do |result, (not_split, split)|
+             temp = m.segment(not_split, t)
+             result += Text::Levenshtein.distance(temp, split)
            end
+           res[t] = difference
+           res
          end
+         res[n] = t_values
+         res
        end
-       result
+       results
      end
    end
  end
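The new `check_recognition` method is a small grid search: it tries every non-empty subset of the trained n-gram sizes and every threshold from 0.1 to 1.0 in steps of 0.1, scoring each combination by the summed Levenshtein distance between the segmenter's output and a set of hand-segmented samples; `optimize` then returns the best-scoring combination. A rough sketch of how this is called, mirroring the trainer spec further down (the corpus, sample pairs, and variable names are illustrative):

```ruby
require "maxixe"

# Train an index over several n-gram sizes, then search for the subset of sizes
# and the threshold that best reproduce some hand-segmented examples.
index = Maxixe::Trainer.generate_corpus_from_io([2, 3, 4], "ILIKEMYDOG
MYDOGISSONICE
IWANTAHOUSEFORMYDOG")

samples = [["MYDOGISSONICE", "MY DOG IS SO NICE"]]  # [unsegmented, hand-segmented] pairs

best = Maxixe::Trainer.optimize(index, samples)
best[:n]     # subset of the trained n-gram sizes, as strings, e.g. ["2", "3"]
best[:t]     # threshold between 0.1 and 1.0
best[:score] # summed Levenshtein distance at that setting (0 means a perfect match)
```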
data/lib/maxixe/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Maxixe
-   VERSION = "0.0.1"
+   VERSION = "0.1.0"
  end
data/maxixe.gemspec CHANGED
@@ -10,11 +10,11 @@ Gem::Specification.new do |s|
    s.email = ["maxixe@rogerbraun.net"]
    s.homepage = "https://github.com/rogerbraun/Maxixe"
    s.summary = %q{A small statistical segmenter for any language.}
-   s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm paramenters.}
+   s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}
 
    s.rubyforge_project = "maxixe"
 
-   s.add_dependency "yajl-ruby"
+   s.add_dependency "text"
    s.add_development_dependency "rspec"
 
    s.files = `git ls-files`.split("\n")
data/spec/segmenter/segmenter_spec.rb CHANGED
@@ -56,12 +56,17 @@ describe Maxixe::Segmenter do
    end
 
    describe "Segmenting Text" do
-     before(:each) do
-       @segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
-     end
 
-     it "should be able to segment text" do
-       @segmenter.segment("ABCDE").should == "ABC DE"
+     it "should do some examples" do
+       index = Maxixe::Trainer.generate_corpus_from_io([3], "ILIKEMYDOG
+ THISHOUSEISMYHOUSE
+ MYDOGISSONICE
+ INMYHOUSETHEREAREFOURDOGS
+ IWANTAHOUSEFORMYDOG")
+
+       m = Maxixe::Segmenter.new(index,0.3)
+       m.segment("FOURNICEDOGS").should == "FOUR NICE DOGS"
+       m.segment("MYDOGISINTHEHOUSE").should == "MY DOG IS IN THE HOUSE"
      end
    end
  end
data/spec/trainer/trainer_spec.rb CHANGED
@@ -2,12 +2,32 @@ require "spec_helper"
 
  describe Maxixe::Trainer do
 
-   it "should generate n-gram data from a set of files" do
+   it "should generate n-gram data from IOs" do
 
      pwd = File.dirname(__FILE__)
 
-     Maxixe::Trainer.generate_training_data([2,3], File.join(pwd, "first_file"), File.join(pwd,"second_file")).should == {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}
+     Maxixe::Trainer.generate_corpus_from_io([2,3], open(File.join(pwd, "first_file"))).should == {"2"=>{"AB"=>1, "BC"=>1, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1}, "3"=>{"ABC"=>1, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1}}
 
    end
 
+   it "should be able to find the optimal threshold and n values" do
+     pre_segmented = [["MYDOGISINTHEHOUSE", "MY DOG IS IN THE HOUSE"],
+                      ["FOURNICEDOGS", "FOUR NICE DOGS"],
+                      ["MYCATLIKESMYDOG", "MY CAT LIKES MY DOG"]]
+     index = Maxixe::Trainer.generate_corpus_from_io([2,3,4,5], "ILIKEMYDOG
+ THISHOUSEISMYHOUSE
+ MYDOGISSONICE
+ WHOLIKESDOGSANYWAY
+ CATSANDDOGSUSUALLYFIGHT
+ INMYHOUSETHEREAREFOURDOGS
+ IWANTAHOUSEFORMYDOG")
+
+     optimal = Maxixe::Trainer.optimize(index, pre_segmented)
+     optimal[:n].should == ["2","4"]
+     optimal[:score].should == 0
+     optimal[:t].should be_within(0.01).of(0.5)
+
+   end
+
+
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: maxixe
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.1.0
  prerelease:
  platform: ruby
  authors:
@@ -9,12 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2011-08-20 00:00:00.000000000 %:z
- default_executable:
+ date: 2011-09-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
- name: yajl-ruby
- requirement: &72352710 !ruby/object:Gem::Requirement
+ name: text
+ requirement: &81697730 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -22,10 +21,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *72352710
+ version_requirements: *81697730
  - !ruby/object:Gem::Dependency
  name: rspec
- requirement: &72352370 !ruby/object:Gem::Requirement
+ requirement: &81697520 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -33,11 +32,11 @@ dependencies:
  version: '0'
  type: :development
  prerelease: false
- version_requirements: *72352370
+ version_requirements: *81697520
  description: Maxixe is an implementation of the Tango algorithm describe in the paper
  "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
  and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
- text given enough corpus data and a tuning of the algorithm paramenters.
+ text given enough corpus data and a tuning of the algorithm parameters.
  email:
  - maxixe@rogerbraun.net
  executables: []
@@ -47,21 +46,16 @@ files:
  - .gitignore
  - .rspec
  - Gemfile
+ - README.md
  - Rakefile
- - lib/.maxixe.rb.swp
  - lib/maxixe.rb
  - lib/maxixe/version.rb
  - maxixe.gemspec
- - spec/segmenter/.segmenter_spec.rb.swp
  - spec/segmenter/segmenter_spec.rb
  - spec/spec_helper.rb
- - spec/trainer/.first_file.swp
- - spec/trainer/.second_file.swp
- - spec/trainer/.trainer_spec.rb.swp
  - spec/trainer/first_file
  - spec/trainer/second_file
  - spec/trainer/trainer_spec.rb
- has_rdoc: true
  homepage: https://github.com/rogerbraun/Maxixe
  licenses: []
  post_install_message:
@@ -82,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project: maxixe
- rubygems_version: 1.6.1
+ rubygems_version: 1.8.9
  signing_key:
  specification_version: 3
  summary: A small statistical segmenter for any language.