maxixe 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in maxixe.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require 'bundler'
2
+ require "rspec/core/rake_task"
3
+
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ desc "Run specs"
7
+ RSpec::Core::RakeTask.new(:spec) do |t|
8
+ t.pattern = "./spec/**/*_spec.rb"
9
+ end
Binary file
data/lib/maxixe.rb ADDED
@@ -0,0 +1,125 @@
1
+ module Maxixe
2
# Splits unsegmented text into chunks using n-gram count comparisons (the
# voting scheme of the Tango algorithm by Ando and Lee).
#
# The index is a Hash keyed by n-gram order as a String ("2", "3", ...);
# each value maps an n-gram String to the number of times it was seen.
class Segmenter
  # Default vote threshold, used whenever a call does not supply its own.
  attr_accessor :t

  # index - Hash of n-gram counts keyed by order (see the class comment).
  # t     - votes strictly above this value always produce a boundary.
  def initialize(index, t = 0.5)
    @index = index
    @n = index.keys.map(&:to_i)
    @t = t
  end

  # Returns a copy of str with a single space inserted at every boundary.
  #
  # str - the text to segment.
  # t   - optional threshold overriding the instance default for this call.
  #
  # Orders that cannot vote are skipped instead of raising: an order longer
  # than str yields no n-grams, and an order below 2 has no n-gram that can
  # straddle a boundary (its vote would be 0 / 0.0 = NaN and would poison the
  # average of all other orders).
  def segment(str, t = nil)
    usable = all_n_grams(str).reject { |grams| grams.empty? || grams.first.size < 2 }

    votes_for_all = usable.map do |grams|
      compute_votes(straddling_and_non_straddling(grams, str), grams.first.size)
    end

    split_with_votes(average_votes(votes_for_all), str, t)
  end

  # Inserts a space after character i of str whenever votes[i] is above the
  # threshold or is a strict local maximum. The first and last vote have only
  # one neighbour and therefore never count as a maximum.
  def split_with_votes(votes, str, t = nil)
    threshold = t || @t
    last = votes.size - 1

    points = votes.each_index.select do |i|
      vote = votes[i]
      local_maximum = i > 0 && i < last && vote > votes[i - 1] && vote > votes[i + 1]
      vote > threshold || local_maximum
    end

    res = str.dup
    # Every space already inserted shifts the later insertion points by one.
    points.each_with_index do |point, inserted|
      res.insert(point + 1 + inserted, " ")
    end
    res
  end

  # Returns, for every indexed order, the list of n-grams of str; each n-gram
  # is an Array of single-character Strings.
  def all_n_grams(str)
    @n.map { |n| str.each_char.each_cons(n).to_a }
  end

  # Returns the indexed count of n_gram, or 0 when the n-gram (or its whole
  # order) is missing from the index.
  def token_count(n_gram)
    counts = @index[n_gram.length.to_s]
    counts ? (counts[n_gram] || 0) : 0
  end

  # Returns one [non_straddling, straddling] pair for every boundary
  # position of str, i.e. str.length - 1 pairs.
  def straddling_and_non_straddling(n_grams, str)
    (0..(str.length - 2)).map do |pos|
      [non_straddling(n_grams, pos), straddling(n_grams, pos)]
    end
  end

  # Returns the joined n-grams that touch the boundary after character pos
  # without crossing it: the one ending at pos and the one starting at
  # pos + 1, in that order, whichever of them exist.
  #
  # n_grams - consecutive n-grams of one order; n_grams[i] starts at
  #           character i.
  def non_straddling(n_grams, pos)
    return [] if n_grams.empty?

    n = n_grams.first.size
    [pos - (n - 1), pos + 1].
      select { |i| i >= 0 && i < n_grams.size }.
      map { |i| n_grams[i].join }
  end

  # Returns the joined n-grams that cross the boundary after character pos,
  # i.e. those starting at or before pos and ending after it.
  def straddling(n_grams, pos)
    return [] if n_grams.empty?

    n = n_grams.first.size
    first = [pos - (n - 1) + 1, 0].max
    last = [pos, n_grams.size - 1].min
    return [] if first > last

    n_grams[first..last].map(&:join)
  end

  # Maps every [non_straddling, straddling] pair to its vote for order n.
  def compute_votes(positions_with_ngrams, n)
    positions_with_ngrams.map do |(non_strad, strad)|
      compute_vote(non_strad, strad, n)
    end
  end

  # Counts the (non-straddling, straddling) pairs in which the non-straddling
  # n-gram is strictly more frequent, then divides by 2 * (n - 1).
  #
  # The result is NaN for n == 1; #segment never calls this with such an order.
  def compute_vote(non_strad, strad, n)
    wins = non_strad.inject(0) do |total, outside|
      outside_count = token_count(outside)
      total + strad.count { |crossing| outside_count > token_count(crossing) }
    end
    wins / (2.0 * (n - 1))
  end

  # Averages the vote lists of all orders position by position. All lists
  # must have the same length; an empty list of lists yields [].
  def average_votes(votes)
    votes.transpose.map { |column| column.inject(:+).to_f / column.size }
  end
end
97
+
98
+
99
# Builds the n-gram count index from plain-text corpus files.
class Trainer
  # Counts the n-grams of the given files and writes the result to output
  # as JSON.
  #
  # n      - the n-gram order(s) to count (see .generate_training_data).
  # output - path of the file to write; an existing file is overwritten.
  # files  - paths of the corpus files to read.
  def self.generate_and_dump(n, output, *files)
    # Nothing else in this file loads yajl, so load it here, where it is
    # needed; counting n-grams does not depend on it.
    require "yajl"

    res = generate_training_data(n, *files)
    File.open(output, "w") do |file|
      Yajl::Encoder.encode(res, file)
    end
  end

  # Counts every n-gram of every line of the given files.
  #
  # n     - an Array of n-gram orders, e.g. [2, 3]; a single Integer is
  #         accepted as well.
  # files - paths of the corpus files to read.
  #
  # Lines keep their line terminator, so n-grams ending in "\n" are counted.
  #
  # Returns a Hash keyed by order as a String ("2", "3", ...) whose values
  # map each n-gram to its count. Missing n-grams read as 0.
  def self.generate_training_data(n, *files)
    orders = Array(n)
    result = orders.each_with_object({}) { |c_n, counts| counts[c_n.to_s] = Hash.new(0) }

    files.each do |file|
      # File.foreach closes the file when done and, unlike Kernel#open,
      # never treats a path starting with "|" as a command to run.
      File.foreach(file) do |line|
        orders.each do |c_n|
          counts = result[c_n.to_s]
          line.each_char.each_cons(c_n) { |chars| counts[chars.join] += 1 }
        end
      end
    end

    result
  end
end
125
+ end
@@ -0,0 +1,3 @@
1
+ module Maxixe
2
+ VERSION = "0.0.1"
3
+ end
data/maxixe.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "maxixe/version"
4
+
5
# Package definition for the maxixe gem. File lists are taken from git, so
# the gem must be built from inside the git checkout.
Gem::Specification.new do |s|
  s.name        = "maxixe"
  s.version     = Maxixe::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Roger Braun"]
  s.email       = ["maxixe@rogerbraun.net"]
  s.homepage    = "https://github.com/rogerbraun/Maxixe"
  s.summary     = %q{A small statistical segmenter for any language.}
  s.description = %q{Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}

  s.rubyforge_project = "maxixe"

  s.add_dependency "yajl-ruby"
  s.add_development_dependency "rspec"

  # Editor swap files (".name.swp") that happen to be tracked by git must not
  # end up in the packaged gem.
  s.files         = `git ls-files`.split("\n").reject { |f| File.basename(f) =~ /\A\..*\.swp\z/ }
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,67 @@
1
+ require "spec_helper"
2
+
3
+ describe Maxixe::Segmenter do
4
+ describe "internal functions" do
5
+
6
+ before(:each) do
7
+ @sentence = "1234567"
8
+ @two_grams = @sentence.each_char.each_cons(2).to_a
9
+ @three_grams = @sentence.each_char.each_cons(3).to_a
10
+ @segmenter = Maxixe::Segmenter.new({})
11
+ end
12
+
13
+ it "should give all non_straddling n_grams for a given position" do
14
+
15
+ # only right segment exists
16
+ @segmenter.non_straddling(@two_grams, 0).should == ["23"]
17
+ @segmenter.non_straddling(@three_grams, 0).should == ["234"]
18
+
19
+ # only left segment exists
20
+ @segmenter.non_straddling(@two_grams, 5).should == ["56"]
21
+ @segmenter.non_straddling(@three_grams, 5).should == ["456"]
22
+
23
+ # both segments exists
24
+ @segmenter.non_straddling(@two_grams, 1).should == ["12","34"]
25
+ @segmenter.non_straddling(@three_grams, 2).should == ["123", "456"]
26
+
27
+ end
28
+
29
+ it "should give all straddling n_grams for a given position" do
30
+
31
+ @segmenter.straddling(@two_grams, 1).should == ["23"]
32
+ @segmenter.straddling(@three_grams, 1).should == ["123", "234"]
33
+ @segmenter.straddling(@three_grams, 0).should == ["123"]
34
+
35
+ end
36
+
37
+ it "should give all straddling and non straddling n-grams for a given string and all positions" do
38
+
39
+ res = @segmenter.straddling_and_non_straddling(@two_grams, @sentence)
40
+
41
+ res.size.should == @sentence.size - 1
42
+
43
+ res[0].should == [["23"],["12"]]
44
+ res[1].should == [["12","34"],["23"]]
45
+
46
+ res = @segmenter.straddling_and_non_straddling(@three_grams, @sentence)
47
+ res[0].should == [["234"],["123"]]
48
+ res[1].should == [["345"],["123","234"]]
49
+
50
+ end
51
+
52
+ it "should average votes" do
53
+ votes = [[1,0,1,0],[0,1,0,1]]
54
+ @segmenter.average_votes(votes).should == [0.5, 0.5, 0.5, 0.5]
55
+ end
56
+ end
57
+
58
+ describe "Segmenting Text" do
59
+ before(:each) do
60
+ @segmenter = Maxixe::Segmenter.new({"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}})
61
+ end
62
+
63
+ it "should be able to segment text" do
64
+ @segmenter.segment("ABCDE").should == "ABC DE"
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'maxixe' # and any other gems you need
5
+
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ ABCDEFG
@@ -0,0 +1 @@
1
+ ABCXYZ
@@ -0,0 +1,13 @@
1
+ require "spec_helper"
2
+
3
+ describe Maxixe::Trainer do
4
+
5
+ it "should generate n-gram data from a set of files" do
6
+
7
+ pwd = File.dirname(__FILE__)
8
+
9
+ Maxixe::Trainer.generate_training_data([2,3], File.join(pwd, "first_file"), File.join(pwd,"second_file")).should == {"2"=>{"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}, "3"=>{"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}}
10
+
11
+ end
12
+
13
+ end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: maxixe
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Roger Braun
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-20 00:00:00.000000000 %:z
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: yajl-ruby
17
+ requirement: &72352710 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *72352710
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: &72352370 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *72352370
37
+ description: Maxixe is an implementation of the Tango algorithm describe in the paper
38
+ "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
39
+ and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
40
+ text given enough corpus data and a tuning of the algorithm paramenters.
41
+ email:
42
+ - maxixe@rogerbraun.net
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - .gitignore
48
+ - .rspec
49
+ - Gemfile
50
+ - Rakefile
51
+ - lib/.maxixe.rb.swp
52
+ - lib/maxixe.rb
53
+ - lib/maxixe/version.rb
54
+ - maxixe.gemspec
55
+ - spec/segmenter/.segmenter_spec.rb.swp
56
+ - spec/segmenter/segmenter_spec.rb
57
+ - spec/spec_helper.rb
58
+ - spec/trainer/.first_file.swp
59
+ - spec/trainer/.second_file.swp
60
+ - spec/trainer/.trainer_spec.rb.swp
61
+ - spec/trainer/first_file
62
+ - spec/trainer/second_file
63
+ - spec/trainer/trainer_spec.rb
64
+ has_rdoc: true
65
+ homepage: https://github.com/rogerbraun/Maxixe
66
+ licenses: []
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubyforge_project: maxixe
85
+ rubygems_version: 1.6.1
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: A small statistical segmenter for any language.
89
+ test_files: []