maxixe 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in maxixe.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
# Rake tasks for the gem: Bundler's gem helper tasks plus an RSpec runner.
require "bundler"
require 'rspec/core/rake_task'

Bundler::GemHelper.install_tasks

# `rake spec` runs every *_spec.rb file found below ./spec.
desc "Run specs"
RSpec::Core::RakeTask.new(:spec) { |task| task.pattern = "./spec/**/*_spec.rb" }
Binary file
data/lib/maxixe.rb ADDED
@@ -0,0 +1,125 @@
1
+ module Maxixe
2
# Inserts spaces into unsegmented text based on n-gram occurrence counts.
#
# For every gap between two adjacent characters, each n-gram order present in
# the index casts a vote: n-grams lying entirely on one side of the gap are
# compared against n-grams that straddle it. The votes of all orders are
# averaged, and a gap becomes a split point when its averaged vote exceeds the
# threshold or is a strict local maximum.
class Segmenter

  # Default threshold used when #segment / #split_with_votes get no explicit t.
  attr_accessor :t

  # index - Hash mapping an n-gram order as a String ("2", "3", ...) to a Hash
  #         of n-gram String => occurrence count.
  # t     - default vote threshold.
  def initialize(index, t = 0.5)
    @index = index
    @n = index.keys.map(&:to_i)
    @t = t
  end

  # Returns a copy of str with a space inserted at every detected boundary.
  #
  # str - the text to segment; it is not modified.
  # t   - optional threshold overriding the instance default for this call.
  def segment(str, t = nil)
    # Pair each order with its n-grams instead of reading the order back from
    # the first n-gram: a string shorter than the order yields no n-grams, and
    # asking the (missing) first one for its size would raise.
    votes_for_all = @n.zip(all_n_grams(str)).map do |n, n_grams|
      compute_votes(straddling_and_non_straddling(n_grams, str), n)
    end

    split_with_votes(average_votes(votes_for_all), str, t)
  end

  # Applies the split rule to one vote per gap and builds the spaced string.
  #
  # votes - Array of numbers; votes[i] belongs to the gap after str[i].
  # str   - the original text; it is not modified.
  # t     - optional threshold overriding the instance default.
  def split_with_votes(votes, str, t = nil)
    threshold = t || @t
    last = votes.size - 1

    points = votes.each_index.select do |i|
      vote = votes[i]
      # The first and last gap have only one neighbour and never count as a
      # local maximum.
      local_maximum = i > 0 && i < last && vote > votes[i - 1] && vote > votes[i + 1]
      vote > threshold || local_maximum
    end

    res = str.dup
    # Every space already inserted shifts the remaining split points by one.
    points.each_with_index do |point, inserted|
      res.insert(point + 1 + inserted, " ")
    end
    res
  end

  # Returns, for each indexed order, the list of n-grams of str, each n-gram
  # being an Array of single-character Strings.
  def all_n_grams(str)
    @n.map { |n| str.each_char.each_cons(n).to_a }
  end

  # Returns the indexed count of the n-gram String, or 0 when it is unknown.
  def token_count(n_gram)
    @index[n_gram.length.to_s][n_gram] || 0
  end

  # Returns one [non_straddling, straddling] pair per gap of str.
  def straddling_and_non_straddling(n_grams, str)
    (0..(str.length - 2)).map do |pos|
      [non_straddling(n_grams, pos), straddling(n_grams, pos)]
    end
  end

  # Returns (as Strings) the n-grams that end exactly at the gap after
  # position pos or start right behind it.
  def non_straddling(n_grams, pos)
    touching = n_grams.each_with_index.select do |n_gram, i|
      i == pos + 1 || i == pos - (n_gram.size - 1)
    end
    touching.map { |n_gram, _| n_gram.join }
  end

  # Returns (as Strings) the n-grams that span the gap after position pos.
  def straddling(n_grams, pos)
    crossing = n_grams.each_with_index.select do |n_gram, i|
      i <= pos && i > pos - (n_gram.size - 1)
    end
    crossing.map { |n_gram, _| n_gram.join }
  end

  # Maps every [non_straddling, straddling] pair to its vote for order n.
  def compute_votes(positions_with_ngrams, n)
    positions_with_ngrams.map do |(non_strad, strad)|
      compute_vote(non_strad, strad, n)
    end
  end

  # Counts how often a non-straddling n-gram is strictly more frequent than a
  # straddling one and scales the count by 2 * (n - 1).
  #
  # The denominator is zero for n == 1, so orders below 2 do not yield a
  # usable vote.
  def compute_vote(non_strad, strad, n)
    wins = non_strad.inject(0) do |sum, outside|
      sum + strad.count { |crossing| token_count(outside) > token_count(crossing) }
    end
    wins / (2.0 * (n - 1))
  end

  # Averages the per-order vote lists gap by gap.
  #
  # votes - Array with one equally long vote Array per order.
  def average_votes(votes)
    votes.transpose.map do |vote_array|
      vote_array.inject(&:+).to_f / vote_array.size
    end
  end
end
97
+
98
+
99
# Builds the n-gram count index consumed by the segmenter from plain text files.
class Trainer

  # Counts n-grams in the given files and writes the result as JSON.
  #
  # n      - Array of n-gram orders (Integers) to count.
  # output - path of the file to write; it is overwritten.
  # files  - paths of the corpus files to read.
  def self.generate_and_dump(n, output, *files)
    # Loaded on demand: nothing else in this file requires Yajl, and only the
    # dump step needs it.
    require "yajl"

    res = generate_training_data(n, *files)
    File.open(output, "w") do |file|
      Yajl::Encoder.encode res, file
    end
  end

  # Counts every n-gram of every requested order, line by line.
  #
  # n     - Array of n-gram orders (Integers) to count.
  # files - paths of the corpus files to read.
  #
  # Returns a Hash mapping each order as a String to a Hash of
  # n-gram String => count. N-grams never span two lines, but the trailing
  # newline of a line is part of its n-grams.
  def self.generate_training_data(n, *files)
    result = n.each_with_object({}) { |c_n, counts| counts[c_n.to_s] = Hash.new(0) }

    files.each do |file|
      # File.foreach closes the file when done and, unlike Kernel#open, never
      # treats a name starting with "|" as a command to run.
      File.foreach(file) do |line|
        n.each do |c_n|
          counts = result[c_n.to_s]
          line.each_char.each_cons(c_n) { |chars| counts[chars.join] += 1 }
        end
      end
    end

    result
  end
end
125
+ end
@@ -0,0 +1,3 @@
1
module Maxixe
  # Gem version; read by maxixe.gemspec. Frozen so it cannot be altered at runtime.
  VERSION = "0.0.1".freeze
end
data/maxixe.gemspec ADDED
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for maxixe.
$:.push File.expand_path("../lib", __FILE__)
require "maxixe/version"

Gem::Specification.new do |s|
  s.name        = "maxixe"
  s.version     = Maxixe::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Roger Braun"]
  s.email       = ["maxixe@rogerbraun.net"]
  s.homepage    = "https://github.com/rogerbraun/Maxixe"
  s.summary     = %q{A small statistical segmenter for any language.}
  s.description = %q{Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}

  s.rubyforge_project = "maxixe"

  s.add_dependency "yajl-ruby"
  s.add_development_dependency "rspec"

  # Ship what git tracks, minus editor swap files that were committed by accident.
  s.files         = `git ls-files`.split("\n").reject { |f| f.end_with?(".swp") }
  # Pick the test files in Ruby: a `{test,spec,features}/*` pathspec relies on
  # the shell behind the backticks doing brace expansion, and matches nothing
  # when it does not.
  s.test_files    = s.files.grep(%r{\A(test|spec|features)/})
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,67 @@
1
require "spec_helper"

# Specs for Maxixe::Segmenter: the n-gram helper methods and a full
# segmentation run.
describe Maxixe::Segmenter do

  describe "internal functions" do

    # A digit string makes the expected n-grams easy to read off by position.
    let(:sentence)    { "1234567" }
    let(:two_grams)   { sentence.each_char.each_cons(2).to_a }
    let(:three_grams) { sentence.each_char.each_cons(3).to_a }
    # The helpers under test never consult the index, so an empty one is enough.
    let(:segmenter)   { Maxixe::Segmenter.new({}) }

    it "should give all non_straddling n_grams for a given position" do
      # only right segment exists
      segmenter.non_straddling(two_grams, 0).should == ["23"]
      segmenter.non_straddling(three_grams, 0).should == ["234"]

      # only left segment exists
      segmenter.non_straddling(two_grams, 5).should == ["56"]
      segmenter.non_straddling(three_grams, 5).should == ["456"]

      # both segments exist
      segmenter.non_straddling(two_grams, 1).should == ["12", "34"]
      segmenter.non_straddling(three_grams, 2).should == ["123", "456"]
    end

    it "should give all straddling n_grams for a given position" do
      segmenter.straddling(two_grams, 1).should == ["23"]
      segmenter.straddling(three_grams, 1).should == ["123", "234"]
      segmenter.straddling(three_grams, 0).should == ["123"]
    end

    it "should give all straddling and non straddling n-grams for a given string and all positions" do
      pairs = segmenter.straddling_and_non_straddling(two_grams, sentence)

      # one [non_straddling, straddling] pair per gap between characters
      pairs.size.should == sentence.size - 1

      pairs[0].should == [["23"], ["12"]]
      pairs[1].should == [["12", "34"], ["23"]]

      pairs = segmenter.straddling_and_non_straddling(three_grams, sentence)
      pairs[0].should == [["234"], ["123"]]
      pairs[1].should == [["345"], ["123", "234"]]
    end

    it "should average votes" do
      per_order_votes = [[1, 0, 1, 0], [0, 1, 0, 1]]
      segmenter.average_votes(per_order_votes).should == [0.5, 0.5, 0.5, 0.5]
    end
  end

  describe "Segmenting Text" do

    # Index holding the 2-gram and 3-gram counts of the lines "ABCDEFG" and "ABCXYZ".
    let(:index) do
      {
        "2" => {"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1},
        "3" => {"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}
      }
    end
    let(:segmenter) { Maxixe::Segmenter.new(index) }

    it "should be able to segment text" do
      segmenter.segment("ABCDE").should == "ABC DE"
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'maxixe' # and any other gems you need
5
+
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ ABCDEFG
@@ -0,0 +1 @@
1
+ ABCXYZ
@@ -0,0 +1,13 @@
1
require "spec_helper"

# Specs for Maxixe::Trainer, run against the two fixture files that sit next
# to this spec.
describe Maxixe::Trainer do

  it "should generate n-gram data from a set of files" do
    fixture_dir = File.dirname(__FILE__)
    corpus = ["first_file", "second_file"].map { |name| File.join(fixture_dir, name) }

    # Each fixture line ends in a newline, which shows up in the last n-grams.
    expected = {
      "2" => {"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1},
      "3" => {"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}
    }

    Maxixe::Trainer.generate_training_data([2, 3], *corpus).should == expected
  end
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: maxixe
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Roger Braun
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-20 00:00:00.000000000 %:z
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: yajl-ruby
17
+ requirement: &72352710 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *72352710
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: &72352370 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *72352370
37
+ description: Maxixe is an implementation of the Tango algorithm described in the paper
38
+ "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
39
+ and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
40
+ text given enough corpus data and a tuning of the algorithm parameters.
41
+ email:
42
+ - maxixe@rogerbraun.net
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - .gitignore
48
+ - .rspec
49
+ - Gemfile
50
+ - Rakefile
51
+ - lib/.maxixe.rb.swp
52
+ - lib/maxixe.rb
53
+ - lib/maxixe/version.rb
54
+ - maxixe.gemspec
55
+ - spec/segmenter/.segmenter_spec.rb.swp
56
+ - spec/segmenter/segmenter_spec.rb
57
+ - spec/spec_helper.rb
58
+ - spec/trainer/.first_file.swp
59
+ - spec/trainer/.second_file.swp
60
+ - spec/trainer/.trainer_spec.rb.swp
61
+ - spec/trainer/first_file
62
+ - spec/trainer/second_file
63
+ - spec/trainer/trainer_spec.rb
64
+ has_rdoc: true
65
+ homepage: https://github.com/rogerbraun/Maxixe
66
+ licenses: []
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubyforge_project: maxixe
85
+ rubygems_version: 1.6.1
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: A small statistical segmenter for any language.
89
+ test_files: []