maxixe 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Rakefile +9 -0
- data/lib/.maxixe.rb.swp +0 -0
- data/lib/maxixe.rb +125 -0
- data/lib/maxixe/version.rb +3 -0
- data/maxixe.gemspec +24 -0
- data/spec/segmenter/.segmenter_spec.rb.swp +0 -0
- data/spec/segmenter/segmenter_spec.rb +67 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/trainer/.first_file.swp +0 -0
- data/spec/trainer/.second_file.swp +0 -0
- data/spec/trainer/.trainer_spec.rb.swp +0 -0
- data/spec/trainer/first_file +1 -0
- data/spec/trainer/second_file +1 -0
- data/spec/trainer/trainer_spec.rb +13 -0
- metadata +89 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/lib/.maxixe.rb.swp
ADDED
Binary file
|
data/lib/maxixe.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
module Maxixe
|
2
|
+
# Statistical segmenter that inserts spaces into unsegmented text by letting
# n-grams of every indexed order "vote" on each boundary between characters.
#
# The index maps an n-gram order (as a String) to a Hash of n-gram counts,
# e.g. {"2" => {"AB" => 2, "BC" => 2}, "3" => {"ABC" => 2}}.
class Segmenter

  # Default vote threshold above which a boundary is always inserted.
  attr_accessor :t

  # @param index [Hash] n-gram counts keyed by order ("2", "3", ...)
  # @param t [Numeric] default vote threshold
  def initialize(index, t = 0.5)
    @index = index
    @n = index.keys.map(&:to_i)
    @t = t
  end

  # Segments +str+ by inserting a space at every boundary selected by the votes.
  #
  # Orders that cannot vote on this string (order below 2, or the string is
  # shorter than the order) are skipped instead of raising or producing NaN.
  #
  # @param str [String] text to segment
  # @param t [Numeric, nil] threshold for this call; falls back to the default
  # @return [String] a new string with spaces inserted
  def segment(str, t = nil)
    votes_for_all = []
    @n.zip(all_n_grams(str)).each do |order, n_grams|
      next if order < 2 || n_grams.empty?

      votes_for_all << compute_votes(straddling_and_non_straddling(n_grams, str), order)
    end

    split_with_votes(average_votes(votes_for_all), str, t)
  end

  # Inserts a space after character +i+ whenever votes[i] exceeds the
  # threshold or is a strict local maximum (first and last votes can only
  # qualify through the threshold).
  #
  # @param votes [Array<Numeric>] one vote per boundary between characters
  # @param str [String] original text (left untouched)
  # @param t [Numeric, nil] threshold; falls back to the default
  # @return [String]
  def split_with_votes(votes, str, t = nil)
    threshold = t || @t
    last = votes.size - 1

    points = []
    votes.each_with_index do |vote, i|
      above_threshold = vote > threshold
      local_maximum = i > 0 && i < last && vote > votes[i - 1] && vote > votes[i + 1]
      points << i if above_threshold || local_maximum
    end

    res = str.dup
    # Every space already inserted shifts the following positions by one.
    points.each_with_index do |point, inserted|
      res.insert(point + 1 + inserted, " ")
    end
    res
  end

  # @param str [String]
  # @return [Array<Array<Array<String>>>] for every indexed order, the list of
  #   n-grams of +str+ (each n-gram is an array of characters); empty for
  #   orders that are not positive or are longer than +str+
  def all_n_grams(str)
    chars = str.each_char.to_a
    @n.map { |order| order < 1 ? [] : chars.each_cons(order).to_a }
  end

  # @param n_gram [String]
  # @return [Integer] count of +n_gram+ in the index, 0 when unknown
  def token_count(n_gram)
    counts = @index[n_gram.length.to_s]
    return 0 unless counts

    counts[n_gram] || 0
  end

  # @param n_grams [Array<Array<String>>] n-grams of a single order
  # @param str [String]
  # @return [Array<Array>] one [non_straddling, straddling] pair per boundary
  def straddling_and_non_straddling(n_grams, str)
    (0..(str.length - 2)).map do |pos|
      [non_straddling(n_grams, pos), straddling(n_grams, pos)]
    end
  end

  # N-grams that end directly before or start directly after the boundary
  # following character +pos+ (left one first).
  #
  # @return [Array<String>]
  def non_straddling(n_grams, pos)
    adjacent = n_grams.each_with_index.select do |n_gram, i|
      i == pos + 1 || i == pos - (n_gram.size - 1)
    end
    adjacent.map { |n_gram, _i| n_gram.join }
  end

  # N-grams that span the boundary following character +pos+.
  #
  # @return [Array<String>]
  def straddling(n_grams, pos)
    spanning = n_grams.each_with_index.select do |n_gram, i|
      i <= pos && i > pos - (n_gram.size - 1)
    end
    spanning.map { |n_gram, _i| n_gram.join }
  end

  # @param positions_with_ngrams [Array<Array>] output of
  #   #straddling_and_non_straddling
  # @param n [Integer] order of the n-grams
  # @return [Array<Float>] one vote per boundary
  def compute_votes(positions_with_ngrams, n)
    positions_with_ngrams.map do |(non_strad, strad)|
      compute_vote(non_strad, strad, n)
    end
  end

  # Fraction of (non-straddling, straddling) pairs in which the non-straddling
  # n-gram is strictly more frequent, normalised by 2 * (n - 1).
  #
  # @return [Float] 0.0 when n < 2, where no n-gram can straddle a boundary
  #   and the normaliser would be zero
  def compute_vote(non_strad, strad, n)
    return 0.0 if n < 2

    wins = non_strad.inject(0) do |total, outside|
      outside_count = token_count(outside)
      total + strad.count { |inside| outside_count > token_count(inside) }
    end
    wins / (2.0 * (n - 1))
  end

  # @param votes [Array<Array<Numeric>>] one equally long vote list per order
  # @return [Array<Float>] per-boundary mean over all orders
  def average_votes(votes)
    votes.transpose.map do |vote_array|
      vote_array.inject(&:+).to_f / vote_array.size
    end
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
# Builds the n-gram count index consumed by the segmenter from plain text files.
class Trainer

  # Generates training data and writes it to +output+ as JSON.
  #
  # @param n [Array<Integer>] n-gram orders to count
  # @param output [String] path of the file to write
  # @param files [Array<String>] paths of the corpus files
  def self.generate_and_dump(n, output, *files)
    # Loaded lazily so that counting n-grams alone does not need the encoder.
    require "yajl" unless defined?(Yajl)

    res = generate_training_data(n, *files)
    File.open(output, "w") do |file|
      Yajl::Encoder.encode res, file
    end
  end

  # Counts every n-gram of the requested orders, line by line. Line
  # terminators are part of the line and therefore show up in n-grams.
  #
  # @param n [Array<Integer>] n-gram orders to count
  # @param files [Array<String>] paths of the corpus files
  # @return [Hash{String => Hash{String => Integer}}] counts keyed by order
  #   (as a String) and then by n-gram
  def self.generate_training_data(n, *files)
    result = n.inject({}) { |counts, c_n| counts[c_n.to_s] = Hash.new(0); counts }

    files.each do |path|
      # File.open with a block closes the handle and, unlike Kernel#open,
      # never interprets a leading "|" as a command to execute.
      File.open(path, "r") do |input|
        input.each_line do |line|
          chars = line.each_char.to_a
          n.each do |c_n|
            counts = result[c_n.to_s]
            chars.each_cons(c_n) { |n_gram| counts[n_gram.join] += 1 }
          end
        end
      end
    end

    result
  end
end
|
125
|
+
end
|
data/maxixe.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for maxixe; the file lists are taken from git.
$:.push File.expand_path("../lib", __FILE__)
require "maxixe/version"

Gem::Specification.new do |s|
  s.name        = "maxixe"
  s.version     = Maxixe::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Roger Braun"]
  s.email       = ["maxixe@rogerbraun.net"]
  s.homepage    = "https://github.com/rogerbraun/Maxixe"
  s.summary     = %q{A small statistical segmenter for any language.}
  s.description = %q{Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}

  s.add_dependency "yajl-ruby"
  s.add_development_dependency "rspec"

  # Skip editor swap files (*.swp) so they never end up in the packaged gem.
  packaged = lambda { |listing| listing.split("\n").reject { |f| File.extname(f) == ".swp" } }

  s.files         = packaged.call(`git ls-files`)
  s.test_files    = packaged.call(`git ls-files -- {test,spec,features}/*`)
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
Binary file
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Maxixe::Segmenter do
|
4
|
+
describe "internal functions" do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
@sentence = "1234567"
|
8
|
+
@two_grams = @sentence.each_char.each_cons(2).to_a
|
9
|
+
@three_grams = @sentence.each_char.each_cons(3).to_a
|
10
|
+
@segmenter = Maxixe::Segmenter.new({})
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should give all non_straddling n_grams for a given position" do
|
14
|
+
|
15
|
+
# only right segment exists
|
16
|
+
@segmenter.non_straddling(@two_grams, 0).should == ["23"]
|
17
|
+
@segmenter.non_straddling(@three_grams, 0).should == ["234"]
|
18
|
+
|
19
|
+
# only left segment exists
|
20
|
+
@segmenter.non_straddling(@two_grams, 5).should == ["56"]
|
21
|
+
@segmenter.non_straddling(@three_grams, 5).should == ["456"]
|
22
|
+
|
23
|
+
# both segments exists
|
24
|
+
@segmenter.non_straddling(@two_grams, 1).should == ["12","34"]
|
25
|
+
@segmenter.non_straddling(@three_grams, 2).should == ["123", "456"]
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should give all straddling n_grams for a given position" do
|
30
|
+
|
31
|
+
@segmenter.straddling(@two_grams, 1).should == ["23"]
|
32
|
+
@segmenter.straddling(@three_grams, 1).should == ["123", "234"]
|
33
|
+
@segmenter.straddling(@three_grams, 0).should == ["123"]
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should give all straddling and non straddling n-grams for a given string and all positions" do
|
38
|
+
|
39
|
+
res = @segmenter.straddling_and_non_straddling(@two_grams, @sentence)
|
40
|
+
|
41
|
+
res.size.should == @sentence.size - 1
|
42
|
+
|
43
|
+
res[0].should == [["23"],["12"]]
|
44
|
+
res[1].should == [["12","34"],["23"]]
|
45
|
+
|
46
|
+
res = @segmenter.straddling_and_non_straddling(@three_grams, @sentence)
|
47
|
+
res[0].should == [["234"],["123"]]
|
48
|
+
res[1].should == [["345"],["123","234"]]
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should average votes" do
|
53
|
+
votes = [[1,0,1,0],[0,1,0,1]]
|
54
|
+
@segmenter.average_votes(votes).should == [0.5, 0.5, 0.5, 0.5]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "Segmenting Text" do
|
59
|
+
before(:each) do
|
60
|
+
@segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should be able to segment text" do
|
64
|
+
@segmenter.segment("ABCDE").should == "ABC DE"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
data/spec/spec_helper.rb
ADDED
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
ABCDEFG
|
@@ -0,0 +1 @@
|
|
1
|
+
ABCXYZ
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe Maxixe::Trainer do
|
4
|
+
|
5
|
+
it "should generate n-gram data from a set of files" do
|
6
|
+
|
7
|
+
pwd = File.dirname(__FILE__)
|
8
|
+
|
9
|
+
Maxixe::Trainer.generate_training_data([2,3], File.join(pwd, "first_file"), File.join(pwd,"second_file")).should == {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}
|
10
|
+
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: maxixe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Roger Braun
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-20 00:00:00.000000000 %:z
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: yajl-ruby
|
17
|
+
requirement: &72352710 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *72352710
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: rspec
|
28
|
+
requirement: &72352370 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *72352370
|
37
|
+
description: Maxixe is an implementation of the Tango algorithm described in the paper
|
38
|
+
"Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
|
39
|
+
and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
|
40
|
+
text given enough corpus data and a tuning of the algorithm parameters.
|
41
|
+
email:
|
42
|
+
- maxixe@rogerbraun.net
|
43
|
+
executables: []
|
44
|
+
extensions: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
files:
|
47
|
+
- .gitignore
|
48
|
+
- .rspec
|
49
|
+
- Gemfile
|
50
|
+
- Rakefile
|
51
|
+
- lib/.maxixe.rb.swp
|
52
|
+
- lib/maxixe.rb
|
53
|
+
- lib/maxixe/version.rb
|
54
|
+
- maxixe.gemspec
|
55
|
+
- spec/segmenter/.segmenter_spec.rb.swp
|
56
|
+
- spec/segmenter/segmenter_spec.rb
|
57
|
+
- spec/spec_helper.rb
|
58
|
+
- spec/trainer/.first_file.swp
|
59
|
+
- spec/trainer/.second_file.swp
|
60
|
+
- spec/trainer/.trainer_spec.rb.swp
|
61
|
+
- spec/trainer/first_file
|
62
|
+
- spec/trainer/second_file
|
63
|
+
- spec/trainer/trainer_spec.rb
|
64
|
+
has_rdoc: true
|
65
|
+
homepage: https://github.com/rogerbraun/Maxixe
|
66
|
+
licenses: []
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubyforge_project: maxixe
|
85
|
+
rubygems_version: 1.6.1
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: A small statistical segmenter for any language.
|
89
|
+
test_files: []
|