maxixe 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Rakefile +9 -0
- data/lib/.maxixe.rb.swp +0 -0
- data/lib/maxixe.rb +125 -0
- data/lib/maxixe/version.rb +3 -0
- data/maxixe.gemspec +24 -0
- data/spec/segmenter/.segmenter_spec.rb.swp +0 -0
- data/spec/segmenter/segmenter_spec.rb +67 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/trainer/.first_file.swp +0 -0
- data/spec/trainer/.second_file.swp +0 -0
- data/spec/trainer/.trainer_spec.rb.swp +0 -0
- data/spec/trainer/first_file +1 -0
- data/spec/trainer/second_file +1 -0
- data/spec/trainer/trainer_spec.rb +13 -0
- metadata +89 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/lib/.maxixe.rb.swp
ADDED
Binary file
|
data/lib/maxixe.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
module Maxixe
|
2
|
+
# Statistical text segmenter based on n-gram voting (the Tango algorithm by
# Ando and Lee). For every gap between two characters, each n-gram order
# compares the corpus counts of the n-grams that end or start at the gap
# (non-straddling) with the n-grams that run across it (straddling). The
# votes of all orders are averaged, and a space is inserted wherever the
# averaged vote exceeds a threshold or is a strict local maximum.
class Segmenter

  # Default vote threshold used when #segment / #split_with_votes get no
  # explicit threshold.
  attr_accessor :t

  # @param index [Hash] n-gram counts. Keys are the n-gram order as a String
  #   ("2", "3", ...); values map an n-gram String to its count.
  # @param t [Numeric] default vote threshold.
  def initialize(index, t = 0.5)
    @index = index
    @n = index.keys.map(&:to_i)
    @t = t
  end

  # Segments +str+ by inserting a space at every detected boundary.
  #
  # Strings shorter than an indexed order simply get no votes from that
  # order (the order is taken from the index, not from the first n-gram,
  # which does not exist for short input).
  #
  # @param str [String] unsegmented text.
  # @param t [Numeric, nil] threshold override; falls back to #t when nil.
  # @return [String] a copy of +str+ with spaces inserted.
  def segment(str, t = nil)
    votes_for_all = voting_orders.map do |n|
      n_grams = str.each_char.each_cons(n).to_a
      compute_votes(straddling_and_non_straddling(n_grams, str), n)
    end

    split_with_votes(average_votes(votes_for_all), str, t)
  end

  # Inserts a space after character +i+ for every vote +i+ that is above the
  # threshold or a strict local maximum (first and last votes can only
  # qualify through the threshold).
  #
  # @param votes [Array<Numeric>] one vote per gap between characters.
  # @param str [String] the text the votes belong to.
  # @param t [Numeric, nil] threshold override; falls back to #t when nil.
  # @return [String] a copy of +str+ with spaces inserted.
  def split_with_votes(votes, str, t = nil)
    threshold = t || @t
    last = votes.size - 1

    points = []
    votes.each_with_index do |vote, i|
      over_threshold = vote > threshold
      local_maximum = i > 0 && i < last && vote > votes[i - 1] && vote > votes[i + 1]
      points << i if over_threshold || local_maximum
    end

    res = str.dup
    points.each_with_index do |point, inserted|
      # +1 puts the space after character `point`; `inserted` compensates
      # for the spaces already added to the left.
      res.insert(point + 1 + inserted, " ")
    end
    res
  end

  # @param str [String]
  # @return [Array<Array<Array<String>>>] for every indexed order, the list
  #   of n-grams of +str+, each n-gram being an array of characters.
  def all_n_grams(str)
    @n.map { |n| str.each_char.each_cons(n).to_a }
  end

  # @param n_gram [String]
  # @return [Integer] the indexed count of +n_gram+, 0 when the n-gram (or
  #   its whole order) is missing from the index.
  def token_count(n_gram)
    counts = @index[n_gram.length.to_s]
    (counts && counts[n_gram]) || 0
  end

  # @param n_grams [Array<Array<String>>] all n-grams of one order for +str+.
  # @param str [String]
  # @return [Array<Array>] for every gap position (0 .. str.length - 2) a
  #   pair [non-straddling n-grams, straddling n-grams].
  def straddling_and_non_straddling(n_grams, str)
    (0..(str.length - 2)).map do |pos|
      [non_straddling(n_grams, pos), straddling(n_grams, pos)]
    end
  end

  # N-grams adjacent to the gap after character +pos+: the one ending at
  # +pos+ and the one starting at +pos+ + 1 (whichever of them exist).
  #
  # @return [Array<String>]
  def non_straddling(n_grams, pos)
    res = []
    n_grams.each_with_index do |n_gram, i|
      res << n_gram if i == pos + 1 || i == pos - (n_gram.size - 1)
    end
    res.map(&:join)
  end

  # N-grams that contain both character +pos+ and character +pos+ + 1.
  #
  # @return [Array<String>]
  def straddling(n_grams, pos)
    res = []
    n_grams.each_with_index do |n_gram, i|
      res << n_gram if i <= pos && i > pos - (n_gram.size - 1)
    end
    res.map(&:join)
  end

  # @param positions_with_ngrams [Array<Array>] output of
  #   #straddling_and_non_straddling.
  # @param n [Integer] order of the n-grams.
  # @return [Array<Float>] one vote per gap position.
  def compute_votes(positions_with_ngrams, n)
    positions_with_ngrams.map do |(non_strad, strad)|
      compute_vote(non_strad, strad, n)
    end
  end

  # Fraction of (non-straddling, straddling) pairs in which the
  # non-straddling n-gram is strictly more frequent, normalised by the
  # maximum number of pairs, 2 * (n - 1).
  #
  # @return [Float] 0.0 for n < 2, where nothing can straddle a gap and the
  #   normalisation would otherwise divide by zero.
  def compute_vote(non_strad, strad, n)
    return 0.0 if n < 2

    wins = non_strad.inject(0) do |sum, s|
      s_count = token_count(s)
      sum + strad.count { |straddler| s_count > token_count(straddler) }
    end
    wins / (2.0 * (n - 1))
  end

  # @param votes [Array<Array<Numeric>>] one vote list per n-gram order.
  # @return [Array<Float>] per-position mean over all orders.
  def average_votes(votes)
    votes.transpose.map do |vote_array|
      vote_array.inject(&:+).to_f / vote_array.size
    end
  end

  private

  # Orders that can take part in voting. Orders below 2 are skipped: a
  # unigram cannot straddle a gap, and its NaN vote would poison the average.
  def voting_orders
    @n.select { |n| n > 1 }
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
# Builds the n-gram count index consumed by the segmenter from plain-text
# corpus files.
class Trainer

  # Generates the n-gram index for +files+ and writes it to +output+ as JSON.
  #
  # @param n [Array<Integer>] n-gram orders to count.
  # @param output [String] path of the JSON file to (over)write.
  # @param files [Array<String>] paths of the corpus files.
  def self.generate_and_dump(n, output, *files)
    # Loaded here because nothing else in this file requires yajl-ruby;
    # keeping it lazy leaves the counting code usable without the gem.
    require "yajl"

    res = generate_training_data(n, *files)
    File.open(output, "w") do |file|
      Yajl::Encoder.encode res, file
    end
  end

  # Counts every n-gram of the requested orders, line by line. N-grams never
  # span two lines, but they do include the line's trailing newline.
  #
  # @param n [Array<Integer>] n-gram orders to count.
  # @param files [Array<String>] paths of the corpus files.
  # @return [Hash{String => Hash{String => Integer}}] counts keyed by the
  #   order as a String ("2", "3", ...), then by n-gram; unseen n-grams
  #   read as 0.
  def self.generate_training_data(n, *files)
    result = n.each_with_object({}) { |c_n, r| r[c_n.to_s] = Hash.new(0) }

    files.each do |file|
      # File.foreach closes the file when done (the former bare `open` never
      # did) and, unlike Kernel#open, never treats "|cmd" as a command.
      File.foreach(file) do |line|
        n.each do |c_n|
          counts = result[c_n.to_s]
          line.each_char.each_cons(c_n) { |n_gram| counts[n_gram.join] += 1 }
        end
      end
    end

    result
  end
end
|
125
|
+
end
|
data/maxixe.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for maxixe. The packaged file list is derived from the
# files tracked by git.
$:.push File.expand_path("../lib", __FILE__)
require "maxixe/version"

Gem::Specification.new do |s|
  s.name        = "maxixe"
  s.version     = Maxixe::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Roger Braun"]
  s.email       = ["maxixe@rogerbraun.net"]
  s.homepage    = "https://github.com/rogerbraun/Maxixe"
  s.summary     = %q{A small statistical segmenter for any language.}
  s.description = %q{Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}

  s.rubyforge_project = "maxixe"

  s.add_dependency "yajl-ruby"
  s.add_development_dependency "rspec"

  # Editor swap files that were committed by accident must not be shipped.
  s.files         = `git ls-files`.split("\n").reject { |f| File.extname(f) == ".swp" }
  # Filter in Ruby instead of passing a brace glob to the shell: backticks
  # run under /bin/sh, which may not expand "{test,spec,features}/*", and
  # that left test_files empty.
  s.test_files    = s.files.grep(%r{\A(test|spec|features)/})
  s.executables   = s.files.grep(%r{\Abin/}).map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
Binary file
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require "spec_helper"

# Specs for Maxixe::Segmenter: the n-gram selection helpers, vote averaging
# and an end-to-end segmentation against a small hand-built index.
describe Maxixe::Segmenter do
  describe "internal functions" do

    let(:sentence)    { "1234567" }
    let(:two_grams)   { sentence.each_char.each_cons(2).to_a }
    let(:three_grams) { sentence.each_char.each_cons(3).to_a }
    let(:segmenter)   { Maxixe::Segmenter.new({}) }

    it "should give all non_straddling n_grams for a given position" do
      # At the first gap there is no complete n-gram on the left.
      segmenter.non_straddling(two_grams, 0).should == ["23"]
      segmenter.non_straddling(three_grams, 0).should == ["234"]

      # At the last gap there is no complete n-gram on the right.
      segmenter.non_straddling(two_grams, 5).should == ["56"]
      segmenter.non_straddling(three_grams, 5).should == ["456"]

      # In the middle both neighbours are present.
      segmenter.non_straddling(two_grams, 1).should == ["12","34"]
      segmenter.non_straddling(three_grams, 2).should == ["123", "456"]
    end

    it "should give all straddling n_grams for a given position" do
      segmenter.straddling(two_grams, 1).should == ["23"]
      segmenter.straddling(three_grams, 1).should == ["123", "234"]
      segmenter.straddling(three_grams, 0).should == ["123"]
    end

    it "should give all straddling and non straddling n-grams for a given string and all positions" do
      bigram_pairs = segmenter.straddling_and_non_straddling(two_grams, sentence)

      # One [non-straddling, straddling] pair per gap between characters.
      bigram_pairs.size.should == sentence.size - 1
      bigram_pairs[0].should == [["23"],["12"]]
      bigram_pairs[1].should == [["12","34"],["23"]]

      trigram_pairs = segmenter.straddling_and_non_straddling(three_grams, sentence)
      trigram_pairs[0].should == [["234"],["123"]]
      trigram_pairs[1].should == [["345"],["123","234"]]
    end

    it "should average votes" do
      segmenter.average_votes([[1,0,1,0],[0,1,0,1]]).should == [0.5, 0.5, 0.5, 0.5]
    end
  end

  describe "Segmenting Text" do

    # Index matching the corpus lines "ABCDEFG\n" and "ABCXYZ\n".
    let(:index) do
      {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}
    end
    let(:segmenter) { Maxixe::Segmenter.new(index) }

    it "should be able to segment text" do
      segmenter.segment("ABCDE").should == "ABC DE"
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
ABCDEFG
|
@@ -0,0 +1 @@
|
|
1
|
+
ABCXYZ
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require "spec_helper"

# Spec for Maxixe::Trainer: counts 2- and 3-grams of the two one-line fixture
# files that live next to this spec.
describe Maxixe::Trainer do

  it "should generate n-gram data from a set of files" do
    fixtures = ["first_file", "second_file"].map do |name|
      File.join(File.dirname(__FILE__), name)
    end

    # Counts include each line's trailing newline ("G\n", "Z\n", ...).
    expected = {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}

    Maxixe::Trainer.generate_training_data([2,3], *fixtures).should == expected
  end

end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: maxixe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Roger Braun
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-20 00:00:00.000000000 %:z
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: yajl-ruby
|
17
|
+
requirement: &72352710 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *72352710
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: rspec
|
28
|
+
requirement: &72352370 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *72352370
|
37
|
+
description: Maxixe is an implementation of the Tango algorithm described in the paper
|
38
|
+
"Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
|
39
|
+
and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
|
40
|
+
text given enough corpus data and a tuning of the algorithm parameters.
|
41
|
+
email:
|
42
|
+
- maxixe@rogerbraun.net
|
43
|
+
executables: []
|
44
|
+
extensions: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
files:
|
47
|
+
- .gitignore
|
48
|
+
- .rspec
|
49
|
+
- Gemfile
|
50
|
+
- Rakefile
|
51
|
+
- lib/.maxixe.rb.swp
|
52
|
+
- lib/maxixe.rb
|
53
|
+
- lib/maxixe/version.rb
|
54
|
+
- maxixe.gemspec
|
55
|
+
- spec/segmenter/.segmenter_spec.rb.swp
|
56
|
+
- spec/segmenter/segmenter_spec.rb
|
57
|
+
- spec/spec_helper.rb
|
58
|
+
- spec/trainer/.first_file.swp
|
59
|
+
- spec/trainer/.second_file.swp
|
60
|
+
- spec/trainer/.trainer_spec.rb.swp
|
61
|
+
- spec/trainer/first_file
|
62
|
+
- spec/trainer/second_file
|
63
|
+
- spec/trainer/trainer_spec.rb
|
64
|
+
has_rdoc: true
|
65
|
+
homepage: https://github.com/rogerbraun/Maxixe
|
66
|
+
licenses: []
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubyforge_project: maxixe
|
85
|
+
rubygems_version: 1.6.1
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: A small statistical segmenter for any language.
|
89
|
+
test_files: []
|