maxixe 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -2,3 +2,4 @@
2
2
  .bundle
3
3
  Gemfile.lock
4
4
  pkg/*
5
+ *.sw?
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in maxixe.gemspec
4
4
  gemspec
5
+
@@ -0,0 +1,10 @@
1
+ # Maxixe
2
+ ### A simple statistical segmenter for any language
3
+
4
+ ## About
5
+
6
+ Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and some tuning of the algorithm parameters.
7
+
8
+ ## How to use
9
+
10
+ First, you need a hash that contains the count of all n-grams in a given corpus.
@@ -1,3 +1,4 @@
1
+ require "text"
1
2
  module Maxixe
2
3
  class Segmenter
3
4
 
@@ -98,28 +99,51 @@ module Maxixe
98
99
 
99
100
  class Trainer
100
101
 
101
- def self.generate_and_dump(n, output, *files)
102
- res = self.generate_training_data(n, *files)
103
- File.open(output,"w") do |file|
104
- Yajl::Encoder.encode res, file
102
+ def self.generate_corpus_from_io(n , io)
103
+ result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
104
+ io.each_line do |line|
105
+ n.each do |c_n|
106
+ n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
107
+ n_grams.each do |n_gram|
108
+ result[c_n.to_s][n_gram] += 1
109
+ end
110
+ end
105
111
  end
112
+ result
106
113
  end
107
114
 
108
- def self.generate_training_data(n, *files)
109
- result = n.inject({}){|r, c_n| r[c_n.to_s] = Hash.new{0}; r}
115
+ def self.optimize(index, samples)
116
+ res = check_recognition(index, samples)
117
+ min = nil
118
+ res.each do |n, ts|
119
+ ts.each do |t, score|
120
+ if !min or score < min[1]
121
+ min = [[n,t],score]
122
+ end
123
+ end
124
+ end
125
+ {:n => min[0][0], :t => min[0][1], :score => min[1]}
126
+ end
110
127
 
111
- files.each do |file|
112
- input = open(file)
113
- input.each_line do |line|
114
- n.each do |c_n|
115
- n_grams = line.each_char.each_cons(c_n).map(&:join).to_a
116
- n_grams.each do |n_gram|
117
- result[c_n.to_s][n_gram] += 1
118
- end
128
+ def self.check_recognition(index, samples)
129
+ # Get all subsets of N
130
+ ns = 1.upto(index.keys.size).map{|i| index.keys.combination(i).to_a}.flatten(1)
131
+ results = ns.inject({}) do |res, n|
132
+ n_index = index.select{|key, value| n.include? key}
133
+ m = Maxixe::Segmenter.new(n_index)
134
+
135
+ t_values = ((0.1)..(1.0)).step(0.1).inject({}) do |res, t|
136
+ difference = samples.inject(0) do |result, (not_split, split)|
137
+ temp = m.segment(not_split, t)
138
+ result += Text::Levenshtein.distance(temp, split)
119
139
  end
140
+ res[t] = difference
141
+ res
120
142
  end
143
+ res[n] = t_values
144
+ res
121
145
  end
122
- result
146
+ results
123
147
  end
124
148
  end
125
149
  end
@@ -1,3 +1,3 @@
1
1
  module Maxixe
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -10,11 +10,11 @@ Gem::Specification.new do |s|
10
10
  s.email = ["maxixe@rogerbraun.net"]
11
11
  s.homepage = "https://github.com/rogerbraun/Maxixe"
12
12
  s.summary = %q{A small statistical segmenter for any language.}
13
- s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm paramenters.}
13
+ s.description = %q{Maxixe is an implementation of the Tango algorithm describe in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}
14
14
 
15
15
  s.rubyforge_project = "maxixe"
16
16
 
17
- s.add_dependency "yajl-ruby"
17
+ s.add_dependency "text"
18
18
  s.add_development_dependency "rspec"
19
19
 
20
20
  s.files = `git ls-files`.split("\n")
@@ -56,12 +56,17 @@ describe Maxixe::Segmenter do
56
56
  end
57
57
 
58
58
  describe "Segmenting Text" do
59
- before(:each) do
60
- @segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
61
- end
62
59
 
63
- it "should be able to segment text" do
64
- @segmenter.segment("ABCDE").should == "ABC DE"
60
+ it "should do some examples" do
61
+ index = Maxixe::Trainer.generate_corpus_from_io([3], "ILIKEMYDOG
62
+ THISHOUSEISMYHOUSE
63
+ MYDOGISSONICE
64
+ INMYHOUSETHEREAREFOURDOGS
65
+ IWANTAHOUSEFORMYDOG")
66
+
67
+ m = Maxixe::Segmenter.new(index,0.3)
68
+ m.segment("FOURNICEDOGS").should == "FOUR NICE DOGS"
69
+ m.segment("MYDOGISINTHEHOUSE").should == "MY DOG IS IN THE HOUSE"
65
70
  end
66
71
  end
67
72
  end
@@ -2,12 +2,32 @@ require "spec_helper"
2
2
 
3
3
  describe Maxixe::Trainer do
4
4
 
5
- it "should generate n-gram data from a set of files" do
5
+ it "should generate n-gram data from IOs" do
6
6
 
7
7
  pwd = File.dirname(__FILE__)
8
8
 
9
- Maxixe::Trainer.generate_training_data([2,3], File.join(pwd, "first_file"), File.join(pwd,"second_file")).should == {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}
9
+ Maxixe::Trainer.generate_corpus_from_io([2,3], open(File.join(pwd, "first_file"))).should == {"2"=>{"AB"=>1, "BC"=>1, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1}, "3"=>{"ABC"=>1, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1}}
10
10
 
11
11
  end
12
12
 
13
+ it "should be able to find the optimal threshold and n values" do
14
+ pre_segmented = [["MYDOGISINTHEHOUSE", "MY DOG IS IN THE HOUSE"],
15
+ ["FOURNICEDOGS", "FOUR NICE DOGS"],
16
+ ["MYCATLIKESMYDOG", "MY CAT LIKES MY DOG"]]
17
+ index = Maxixe::Trainer.generate_corpus_from_io([2,3,4,5], "ILIKEMYDOG
18
+ THISHOUSEISMYHOUSE
19
+ MYDOGISSONICE
20
+ WHOLIKESDOGSANYWAY
21
+ CATSANDDOGSUSUALLYFIGHT
22
+ INMYHOUSETHEREAREFOURDOGS
23
+ IWANTAHOUSEFORMYDOG")
24
+
25
+ optimal = Maxixe::Trainer.optimize(index, pre_segmented)
26
+ optimal[:n].should == ["2","4"]
27
+ optimal[:score].should == 0
28
+ optimal[:t].should be_within(0.01).of(0.5)
29
+
30
+ end
31
+
32
+
13
33
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: maxixe
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-08-20 00:00:00.000000000 %:z
13
- default_executable:
12
+ date: 2011-09-11 00:00:00.000000000 Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
- name: yajl-ruby
17
- requirement: &72352710 !ruby/object:Gem::Requirement
15
+ name: text
16
+ requirement: &81697730 !ruby/object:Gem::Requirement
18
17
  none: false
19
18
  requirements:
20
19
  - - ! '>='
@@ -22,10 +21,10 @@ dependencies:
22
21
  version: '0'
23
22
  type: :runtime
24
23
  prerelease: false
25
- version_requirements: *72352710
24
+ version_requirements: *81697730
26
25
  - !ruby/object:Gem::Dependency
27
26
  name: rspec
28
- requirement: &72352370 !ruby/object:Gem::Requirement
27
+ requirement: &81697520 !ruby/object:Gem::Requirement
29
28
  none: false
30
29
  requirements:
31
30
  - - ! '>='
@@ -33,11 +32,11 @@ dependencies:
33
32
  version: '0'
34
33
  type: :development
35
34
  prerelease: false
36
- version_requirements: *72352370
35
+ version_requirements: *81697520
37
36
  description: Maxixe is an implementation of the Tango algorithm describe in the paper
38
37
  "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
39
38
  and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
40
- text given enough corpus data and a tuning of the algorithm paramenters.
39
+ text given enough corpus data and a tuning of the algorithm parameters.
41
40
  email:
42
41
  - maxixe@rogerbraun.net
43
42
  executables: []
@@ -47,21 +46,16 @@ files:
47
46
  - .gitignore
48
47
  - .rspec
49
48
  - Gemfile
49
+ - README.md
50
50
  - Rakefile
51
- - lib/.maxixe.rb.swp
52
51
  - lib/maxixe.rb
53
52
  - lib/maxixe/version.rb
54
53
  - maxixe.gemspec
55
- - spec/segmenter/.segmenter_spec.rb.swp
56
54
  - spec/segmenter/segmenter_spec.rb
57
55
  - spec/spec_helper.rb
58
- - spec/trainer/.first_file.swp
59
- - spec/trainer/.second_file.swp
60
- - spec/trainer/.trainer_spec.rb.swp
61
56
  - spec/trainer/first_file
62
57
  - spec/trainer/second_file
63
58
  - spec/trainer/trainer_spec.rb
64
- has_rdoc: true
65
59
  homepage: https://github.com/rogerbraun/Maxixe
66
60
  licenses: []
67
61
  post_install_message:
@@ -82,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
76
  version: '0'
83
77
  requirements: []
84
78
  rubyforge_project: maxixe
85
- rubygems_version: 1.6.1
79
+ rubygems_version: 1.8.9
86
80
  signing_key:
87
81
  specification_version: 3
88
82
  summary: A small statistical segmenter for any language.
Binary file
Binary file