maxixe 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
 - data/.rspec +2 -0
 - data/Gemfile +4 -0
 - data/Rakefile +9 -0
 - data/lib/.maxixe.rb.swp +0 -0
 - data/lib/maxixe.rb +125 -0
 - data/lib/maxixe/version.rb +3 -0
 - data/maxixe.gemspec +24 -0
 - data/spec/segmenter/.segmenter_spec.rb.swp +0 -0
 - data/spec/segmenter/segmenter_spec.rb +67 -0
 - data/spec/spec_helper.rb +5 -0
 - data/spec/trainer/.first_file.swp +0 -0
 - data/spec/trainer/.second_file.swp +0 -0
 - data/spec/trainer/.trainer_spec.rb.swp +0 -0
 - data/spec/trainer/first_file +1 -0
 - data/spec/trainer/second_file +1 -0
 - data/spec/trainer/trainer_spec.rb +13 -0
 - metadata +89 -0
 
    
        data/.gitignore
    ADDED
    
    
    
        data/.rspec
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/Rakefile
    ADDED
    
    
    
        data/lib/.maxixe.rb.swp
    ADDED
    
    | 
         Binary file 
     | 
    
        data/lib/maxixe.rb
    ADDED
    
    | 
         @@ -0,0 +1,125 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Maxixe
         
     | 
| 
      
 2 
     | 
    
         
            +
              # Splits unsegmented text by letting n-gram frequency counts "vote" on
              # every boundary between two adjacent characters.
              #
              # The index is a hash keyed by n-gram length as a string ("2", "3", ...);
              # each value maps an n-gram string to the number of times it was seen.
              #
              # Boundaries are numbered from 0: boundary +pos+ lies between the
              # characters at offsets +pos+ and +pos + 1+ of the input string.
              class Segmenter

                # Default vote threshold used when #segment is called without one.
                attr_accessor :t

                # index:: n-gram counts, keyed by n-gram length as a string.
                # t::     default threshold a vote has to exceed to force a split.
                def initialize(index, t = 0.5)
                  @index = index
                  @n = index.keys.map(&:to_i)
                  @t = t
                end

                # Returns a copy of +str+ with a single space inserted at every
                # boundary that was voted for.
                #
                # str:: the text to segment.
                # t::   optional threshold overriding the instance default.
                def segment(str, t = nil)
                  # Pair every n-gram list with the size it was built for. The size
                  # must not be read back from the list itself: the list is empty
                  # whenever +str+ is shorter than the n-gram length.
                  votes_for_all = @n.zip(all_n_grams(str)).map do |size, n_grams|
                    compute_votes(straddling_and_non_straddling(n_grams, str), size)
                  end

                  averaged = average_votes(votes_for_all)

                  split_with_votes(averaged, str, t)
                end

                # Inserts a space at every boundary whose vote exceeds the threshold
                # or is a strict local maximum among its two neighbours.
                #
                # votes:: one averaged vote per boundary of +str+.
                # str::   the original text (left untouched; a copy is returned).
                # t::     optional threshold overriding the instance default.
                def split_with_votes(votes, str, t = nil)
                  threshold = t || @t
                  last = votes.size - 1

                  points = []
                  votes.each_with_index do |vote, i|
                    above_threshold = vote > threshold
                    # The first and last boundary have only one neighbour and are
                    # never treated as a local maximum.
                    local_maximum = i > 0 && i < last &&
                                    vote > votes[i - 1] && vote > votes[i + 1]

                    points << i if above_threshold || local_maximum
                  end

                  res = str.dup
                  # Every space already inserted shifts the following boundaries
                  # one character to the right.
                  points.each_with_index do |point, inserted|
                    res.insert(point + 1 + inserted, " ")
                  end

                  res
                end

                # Returns, for every indexed n-gram length, the list of n-grams of
                # +str+ (each n-gram as an array of characters).
                def all_n_grams(str)
                  @n.map { |n| str.each_char.each_cons(n).to_a }
                end

                # Returns how often +n_gram+ occurs in the index, 0 when it (or the
                # whole table for its length) is unknown.
                def token_count(n_gram)
                  counts = @index[n_gram.length.to_s]
                  counts ? (counts[n_gram] || 0) : 0
                end

                # Returns one [non_straddling, straddling] pair per boundary of +str+.
                #
                # n_grams:: all n-grams of +str+ for a single n, in order.
                def straddling_and_non_straddling(n_grams, str)
                  (0..(str.length - 2)).map do |pos|
                    [non_straddling(n_grams, pos), straddling(n_grams, pos)]
                  end
                end

                # Returns the n-grams that end directly left of boundary +pos+ and
                # start directly right of it (left one first), joined into strings.
                # Neighbours that fall outside the string are left out.
                def non_straddling(n_grams, pos)
                  return [] if n_grams.empty?

                  n = n_grams.first.size
                  # The n-gram at list index i covers characters i..(i + n - 1).
                  neighbours = [pos - (n - 1), pos + 1]
                  neighbours = neighbours.select { |i| i >= 0 && i < n_grams.size }
                  neighbours.map { |i| n_grams[i].join }
                end

                # Returns the n-grams that cover characters on both sides of
                # boundary +pos+, joined into strings.
                def straddling(n_grams, pos)
                  return [] if n_grams.empty?

                  n = n_grams.first.size
                  first = [pos - n + 2, 0].max
                  last  = [pos, n_grams.size - 1].min
                  # An empty range (first > last) yields no n-grams.
                  (first..last).map { |i| n_grams[i].join }
                end

                # Turns the [non_straddling, straddling] pair of every boundary into
                # a vote for n-grams of length +n+.
                def compute_votes(positions_with_ngrams, n)
                  positions_with_ngrams.map do |(non_strad, strad)|
                    compute_vote(non_strad, strad, n)
                  end
                end

                # Counts how many (non-straddling, straddling) pairs have the
                # non-straddling n-gram strictly more frequent, scaled by
                # 2 * (n - 1), the largest possible number of such pairs.
                def compute_vote(non_strad, strad, n)
                  # A 1-gram can never straddle a boundary; without this guard the
                  # division below would be 0 / 0.0 and poison the average with NaN.
                  return 0.0 if n < 2

                  wins = non_strad.inject(0) do |total, outside|
                    outside_count = token_count(outside)
                    total + strad.count { |across| outside_count > token_count(across) }
                  end

                  wins / (2.0 * (n - 1))
                end

                # Averages the votes of all n-gram lengths boundary by boundary.
                #
                # votes:: one array of votes per n-gram length, all of equal size.
                def average_votes(votes)
                  votes.transpose.map do |vote_array|
                    vote_array.inject(&:+).to_f / vote_array.size
                  end
                end
              end
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
             
     | 
| 
      
 99 
     | 
    
         
            +
              # Builds the n-gram count index from plain-text corpus files.
              class Trainer

                # Generates the n-gram counts for +files+ and writes them to the
                # file at +output+, encoded with Yajl::Encoder.
                #
                # n::      array of n-gram lengths to count, e.g. [2, 3].
                # output:: path of the file to write (overwritten if present).
                # files::  paths of the corpus files to read.
                def self.generate_and_dump(n, output, *files)
                  res = generate_training_data(n, *files)
                  File.open(output, "w") do |file|
                    Yajl::Encoder.encode res, file
                  end
                end

                # Counts every n-gram of every line of +files+.
                #
                # Lines are not stripped, so a trailing newline takes part in the
                # n-grams at the end of a line.
                #
                # n::     array of n-gram lengths to count.
                # files:: paths of the corpus files to read.
                #
                # Returns a hash keyed by n-gram length as a string; each value maps
                # an n-gram to its number of occurrences.
                def self.generate_training_data(n, *files)
                  result = n.inject({}) { |r, c_n| r[c_n.to_s] = Hash.new(0); r }

                  files.each do |file|
                    # The block form closes the file once it has been read.
                    File.open(file) do |input|
                      input.each_line do |line|
                        n.each do |c_n|
                          line.each_char.each_cons(c_n) do |chars|
                            result[c_n.to_s][chars.join] += 1
                          end
                        end
                      end
                    end
                  end

                  result
                end
              end
         
     | 
| 
      
 125 
     | 
    
         
            +
            end
         
     | 
    
        data/maxixe.gemspec
    ADDED
    
    | 
         @@ -0,0 +1,24 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # -*- encoding: utf-8 -*-
         
     | 
| 
      
 2 
     | 
    
         
            +
            $:.push File.expand_path("../lib", __FILE__)
         
     | 
| 
      
 3 
     | 
    
         
            +
            require "maxixe/version"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            Gem::Specification.new do |s|
              s.name        = "maxixe"
              s.version     = Maxixe::VERSION
              s.platform    = Gem::Platform::RUBY
              s.authors     = ["Roger Braun"]
              s.email       = ["maxixe@rogerbraun.net"]
              s.homepage    = "https://github.com/rogerbraun/Maxixe"
              s.summary     = %q{A small statistical segmenter for any language.}
              s.description = %q{Maxixe is an implementation of the Tango algorithm described in the paper "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando and Lee. While the paper deals with Japanese characters, it should work on any unsegmented text given enough corpus data and a tuning of the algorithm parameters.}

              s.rubyforge_project = "maxixe"

              s.add_dependency "yajl-ruby"
              s.add_development_dependency "rspec"

              # Editor swap files that happen to be tracked by git must not be
              # shipped inside the gem.
              swap_file = lambda { |f| File.extname(f) == ".swp" }

              s.files         = `git ls-files`.split("\n").reject(&swap_file)
              s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n").reject(&swap_file)
              s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
              s.require_paths = ["lib"]
            end
         
     | 
| 
         Binary file 
     | 
| 
         @@ -0,0 +1,67 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "spec_helper"
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            describe Maxixe::Segmenter do
              describe "internal functions" do

                # Fixture: a seven character sentence, its 2- and 3-grams and a
                # segmenter built from an empty index.
                before(:each) do
                  @sentence = "1234567"
                  chars = @sentence.each_char
                  @two_grams = chars.each_cons(2).to_a
                  @three_grams = chars.each_cons(3).to_a
                  @segmenter = Maxixe::Segmenter.new({})
                end

                it "should give all non_straddling n_grams for a given position" do
                  cases = [
                    # only the right neighbour exists
                    [@two_grams,   0, ["23"]],
                    [@three_grams, 0, ["234"]],
                    # only the left neighbour exists
                    [@two_grams,   5, ["56"]],
                    [@three_grams, 5, ["456"]],
                    # both neighbours exist
                    [@two_grams,   1, ["12", "34"]],
                    [@three_grams, 2, ["123", "456"]]
                  ]

                  cases.each do |n_grams, position, expected|
                    @segmenter.non_straddling(n_grams, position).should == expected
                  end
                end

                it "should give all straddling n_grams for a given position" do
                  cases = [
                    [@two_grams,   1, ["23"]],
                    [@three_grams, 1, ["123", "234"]],
                    [@three_grams, 0, ["123"]]
                  ]

                  cases.each do |n_grams, position, expected|
                    @segmenter.straddling(n_grams, position).should == expected
                  end
                end

                it "should give all straddling and non straddling n-grams for a given string and all positions" do
                  # One [non_straddling, straddling] pair per boundary.
                  for_two_grams = @segmenter.straddling_and_non_straddling(@two_grams, @sentence)

                  for_two_grams.size.should == @sentence.size - 1
                  for_two_grams[0].should == [["23"], ["12"]]
                  for_two_grams[1].should == [["12", "34"], ["23"]]

                  for_three_grams = @segmenter.straddling_and_non_straddling(@three_grams, @sentence)

                  for_three_grams[0].should == [["234"], ["123"]]
                  for_three_grams[1].should == [["345"], ["123", "234"]]
                end

                it "should average votes" do
                  per_n_gram_size = [[1, 0, 1, 0], [0, 1, 0, 1]]
                  averaged = @segmenter.average_votes(per_n_gram_size)
                  averaged.should == [0.5, 0.5, 0.5, 0.5]
                end
              end

              describe "Segmenting Text" do
                # Index as generated from the two trainer fixture files.
                before(:each) do
                  two_gram_counts = {"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1}
                  three_gram_counts = {"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}
                  @segmenter = Maxixe::Segmenter.new({"2" => two_gram_counts, "3" => three_gram_counts})
                end

                it "should be able to segment text" do
                  segmented = @segmenter.segment("ABCDE")
                  segmented.should == "ABC DE"
                end
              end
            end
         
     | 
    
        data/spec/spec_helper.rb
    ADDED
    
    
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         Binary file 
     | 
| 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ABCDEFG
         
     | 
| 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ABCXYZ
         
     | 
| 
         @@ -0,0 +1,13 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "spec_helper"
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            describe Maxixe::Trainer do

              it "should generate n-gram data from a set of files" do
                # The fixture files live next to this spec.
                here = File.dirname(__FILE__)
                fixtures = ["first_file", "second_file"].map { |name| File.join(here, name) }

                expected = {
                  "2" => {"AB"=>2, "BC"=>2, "CD"=>1, "DE"=>1, "EF"=>1, "FG"=>1, "G\n"=>1, "CX"=>1, "XY"=>1, "YZ"=>1, "Z\n"=>1},
                  "3" => {"ABC"=>2, "BCD"=>1, "CDE"=>1, "DEF"=>1, "EFG"=>1, "FG\n"=>1, "BCX"=>1, "CXY"=>1, "XYZ"=>1, "YZ\n"=>1}
                }

                counts = Maxixe::Trainer.generate_training_data([2, 3], *fixtures)
                counts.should == expected
              end

            end
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,89 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: maxixe
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.1
         
     | 
| 
      
 5 
     | 
    
         
            +
              prerelease: 
         
     | 
| 
      
 6 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 7 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 8 
     | 
    
         
            +
            - Roger Braun
         
     | 
| 
      
 9 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 10 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 11 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2011-08-20 00:00:00.000000000 %:z
         
     | 
| 
      
 13 
     | 
    
         
            +
            default_executable: 
         
     | 
| 
      
 14 
     | 
    
         
            +
            dependencies:
         
     | 
| 
      
 15 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 16 
     | 
    
         
            +
              name: yajl-ruby
         
     | 
| 
      
 17 
     | 
    
         
            +
              requirement: &72352710 !ruby/object:Gem::Requirement
         
     | 
| 
      
 18 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 19 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 20 
     | 
    
         
            +
                - - ! '>='
         
     | 
| 
      
 21 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 22 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 23 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 24 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 25 
     | 
    
         
            +
              version_requirements: *72352710
         
     | 
| 
      
 26 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 27 
     | 
    
         
            +
              name: rspec
         
     | 
| 
      
 28 
     | 
    
         
            +
              requirement: &72352370 !ruby/object:Gem::Requirement
         
     | 
| 
      
 29 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 30 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 31 
     | 
    
         
            +
                - - ! '>='
         
     | 
| 
      
 32 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 33 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 34 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 35 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 36 
     | 
    
         
            +
              version_requirements: *72352370
         
     | 
| 
      
 37 
     | 
    
         
            +
            description: Maxixe is an implementation of the Tango algorithm described in the paper
         
     | 
| 
      
 38 
     | 
    
         
            +
              "Mostly-unsupervised statistical segmentation of Japanese kanji sequences" by Ando
         
     | 
| 
      
 39 
     | 
    
         
            +
              and Lee. While the paper deals with Japanese characters, it should work on any unsegmented
         
     | 
| 
      
 40 
     | 
    
         
            +
              text given enough corpus data and a tuning of the algorithm parameters.
         
     | 
| 
      
 41 
     | 
    
         
            +
            email:
         
     | 
| 
      
 42 
     | 
    
         
            +
            - maxixe@rogerbraun.net
         
     | 
| 
      
 43 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 44 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 45 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 46 
     | 
    
         
            +
            files:
         
     | 
| 
      
 47 
     | 
    
         
            +
            - .gitignore
         
     | 
| 
      
 48 
     | 
    
         
            +
            - .rspec
         
     | 
| 
      
 49 
     | 
    
         
            +
            - Gemfile
         
     | 
| 
      
 50 
     | 
    
         
            +
            - Rakefile
         
     | 
| 
      
 51 
     | 
    
         
            +
            - lib/.maxixe.rb.swp
         
     | 
| 
      
 52 
     | 
    
         
            +
            - lib/maxixe.rb
         
     | 
| 
      
 53 
     | 
    
         
            +
            - lib/maxixe/version.rb
         
     | 
| 
      
 54 
     | 
    
         
            +
            - maxixe.gemspec
         
     | 
| 
      
 55 
     | 
    
         
            +
            - spec/segmenter/.segmenter_spec.rb.swp
         
     | 
| 
      
 56 
     | 
    
         
            +
            - spec/segmenter/segmenter_spec.rb
         
     | 
| 
      
 57 
     | 
    
         
            +
            - spec/spec_helper.rb
         
     | 
| 
      
 58 
     | 
    
         
            +
            - spec/trainer/.first_file.swp
         
     | 
| 
      
 59 
     | 
    
         
            +
            - spec/trainer/.second_file.swp
         
     | 
| 
      
 60 
     | 
    
         
            +
            - spec/trainer/.trainer_spec.rb.swp
         
     | 
| 
      
 61 
     | 
    
         
            +
            - spec/trainer/first_file
         
     | 
| 
      
 62 
     | 
    
         
            +
            - spec/trainer/second_file
         
     | 
| 
      
 63 
     | 
    
         
            +
            - spec/trainer/trainer_spec.rb
         
     | 
| 
      
 64 
     | 
    
         
            +
            has_rdoc: true
         
     | 
| 
      
 65 
     | 
    
         
            +
            homepage: https://github.com/rogerbraun/Maxixe
         
     | 
| 
      
 66 
     | 
    
         
            +
            licenses: []
         
     | 
| 
      
 67 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 68 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 69 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 70 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 71 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 72 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 73 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 74 
     | 
    
         
            +
              - - ! '>='
         
     | 
| 
      
 75 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 76 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 77 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 78 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 79 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 80 
     | 
    
         
            +
              - - ! '>='
         
     | 
| 
      
 81 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 82 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 83 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 84 
     | 
    
         
            +
            rubyforge_project: maxixe
         
     | 
| 
      
 85 
     | 
    
         
            +
            rubygems_version: 1.6.1
         
     | 
| 
      
 86 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 87 
     | 
    
         
            +
            specification_version: 3
         
     | 
| 
      
 88 
     | 
    
         
            +
            summary: A small statistical segmenter for any language.
         
     | 
| 
      
 89 
     | 
    
         
            +
            test_files: []
         
     |