RubyGems - rmmseg-cpp-traditional - Versions diffs - 0.0.2 - Mend

rmmseg-cpp-traditional 0.0.2

Files changed (51) hide show

data/.gitignore +17 -0
data/Gemfile +4 -0
data/History.txt +21 -0
data/LICENSE.txt +22 -0
data/Manifest.txt +43 -0
data/README +111 -0
data/README.md +29 -0
data/Rakefile +19 -0
data/bin/rmmseg +63 -0
data/data/chars.dic +12638 -0
data/data/words.dic +120308 -0
data/ext/rmmseg/algor.cpp +222 -0
data/ext/rmmseg/algor.h +80 -0
data/ext/rmmseg/chunk.h +59 -0
data/ext/rmmseg/dict.cpp +230 -0
data/ext/rmmseg/dict.h +34 -0
data/ext/rmmseg/extconf.rb +17 -0
data/ext/rmmseg/memory.cpp +9 -0
data/ext/rmmseg/memory.h +43 -0
data/ext/rmmseg/rmmseg.cpp +263 -0
data/ext/rmmseg/rules.h +86 -0
data/ext/rmmseg/token.h +19 -0
data/ext/rmmseg/word.h +44 -0
data/lib/rmmseg/dictionary.rb +59 -0
data/lib/rmmseg/ferret.rb +64 -0
data/lib/rmmseg-cpp-traditional/version.rb +7 -0
data/lib/rmmseg-cpp-traditional.rb +9 -0
data/lib/rmmseg.rb +3 -0
data/misc/convert.rb +114 -0
data/misc/ferret_example.rb +59 -0
data/misc/homepage.erb +196 -0
data/misc/homepage.html +1212 -0
data/rmmseg-cpp-traditional.gemspec +19 -0
data/spec/rmmseg_spec.rb +8 -0
data/spec/spec_helper.rb +17 -0
data/tasks/ann.rake +81 -0
data/tasks/bones.rake +21 -0
data/tasks/gem.rake +126 -0
data/tasks/git.rake +41 -0
data/tasks/homepage.rake +15 -0
data/tasks/manifest.rake +49 -0
data/tasks/notes.rake +28 -0
data/tasks/post_load.rake +39 -0
data/tasks/rdoc.rake +51 -0
data/tasks/rubyforge.rake +58 -0
data/tasks/setup.rb +268 -0
data/tasks/spec.rake +55 -0
data/tasks/svn.rake +48 -0
data/tasks/test.rake +38 -0
data/test/test_rmmseg.rb +0 -0
metadata +116 -0

data/ext/rmmseg/rmmseg.cpp ADDED Viewed

@@ -0,0 +1,263 @@
+#include <ruby.h>
+#include <cstdio>               // for debug
+#include "token.h"
+#include "dict.h"
+#include "algor.h"
+using namespace std;
+extern "C" {
+    /*****************************************
+     *
+     * Normal interface
+     *
+     *****************************************/
+    /*********************
+     * RMMSeg module
+     *********************/
+    static VALUE mRMMSeg;
+    /*********************
+     * Dictionary module
+     *********************/
+    static VALUE mDictionary;
+    /*
+     * Load a character dictionary.
+     *
+     * call-seq:
+     *   load_chars(path)    -> status
+     *
+     * +path+ must be a String (or convertible with +to_str+) without
+     * embedded NUL bytes; otherwise TypeError / ArgumentError is raised.
+     *
+     * Return +true+ if loaded successfully, +false+ otherwise.
+     */
+    static VALUE dic_load_chars(VALUE mod, VALUE path)
+    {
+        // StringValueCStr type-checks the argument and yields a
+        // NUL-terminated buffer. A bare RSTRING_PTR on a non-String VALUE
+        // would read unrelated memory.
+        const char *filename = StringValueCStr(path);
+        return rmmseg::dict::load_chars(filename) ? Qtrue : Qfalse;
+    }
+    /*
+     * Load a word dictionary.
+     *
+     * call-seq:
+     *   load_words(path)    -> status
+     *
+     * +path+ must be a String (or convertible with +to_str+) without
+     * embedded NUL bytes; otherwise TypeError / ArgumentError is raised.
+     *
+     * Return +true+ if loaded successfully, +false+ otherwise.
+     */
+    static VALUE dic_load_words(VALUE mod, VALUE path)
+    {
+        // StringValueCStr type-checks the argument and yields a
+        // NUL-terminated buffer. A bare RSTRING_PTR on a non-String VALUE
+        // would read unrelated memory.
+        const char *filename = StringValueCStr(path);
+        return rmmseg::dict::load_words(filename) ? Qtrue : Qfalse;
+    }
+    /*
+     * Add a word to the in-memory dictionary.
+     *
+     * call-seq:
+     *   add(word, length, freq)
+     *
+     * - +word+ is a String of at most 255 bytes.
+     * - +length+ is number of characters (not number of bytes) of the
+     *   word to be added.
+     * - +freq+ is the frequency of the word. This is only used when
+     *   it is a one-character word.
+     *
+     * Raises TypeError if an argument has the wrong type and
+     * ArgumentError if +word+ is longer than 255 bytes.
+     */
+    static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
+    {
+        // Convert the integers first: NUM2INT may call back into Ruby, so
+        // the string buffer pointer is fetched only afterwards.
+        int nchars = NUM2INT(len);
+        int frequency = NUM2INT(freq);
+        StringValue(word);
+        long nbytes = RSTRING_LEN(word);
+        // rmmseg::Word stores its byte count in an unsigned char, so a
+        // longer word would be recorded with a truncated size.
+        if (nbytes > 255)
+            rb_raise(rb_eArgError,
+                     "word too long (%ld bytes, maximum is 255)", nbytes);
+        rmmseg::Word *w = rmmseg::make_word(RSTRING_PTR(word), nchars,
+                                            frequency,
+                                            static_cast<int>(nbytes));
+        rmmseg::dict::add(w);
+        return Qnil;
+    }
+    /*
+     * Check whether one word is included in the dictionary.
+     *
+     * call-seq:
+     *   has_word?(word)    -> result
+     *
+     * +word+ must be a String (or convertible with +to_str+); otherwise
+     * TypeError is raised.
+     *
+     * Return +true+ if the word is included in the dictionary,
+     * +false+ otherwise.
+     */
+    static VALUE dic_has_word(VALUE mod, VALUE word)
+    {
+        // Type-check before touching the buffer: RSTRING_PTR/RSTRING_LEN
+        // are only meaningful on a real String.
+        StringValue(word);
+        const char *str = RSTRING_PTR(word);
+        int nbytes = static_cast<int>(RSTRING_LEN(word));
+        return rmmseg::dict::get(str, nbytes) != NULL ? Qtrue : Qfalse;
+    }
+    /**********************
+     * Token Class
+     **********************/
+    // Native data wrapped by an RMMSeg::Token Ruby object. All three
+    // fields are Ruby VALUEs that the accessors return as-is.
+    struct Token
+    {
+        VALUE text;             // Ruby String holding the token's text
+        VALUE start;            // Fixnum offset where the token starts
+        VALUE end;              // Fixnum offset just past the token
+    };
+    // GC mark hook for RMMSeg::Token: only the text String needs marking,
+    // because start and end are stored as Fixnums.
+    static void tk_mark(Token *token)
+    {
+        rb_gc_mark(token->text);
+    }
+    // GC free hook for RMMSeg::Token. The struct is obtained with ALLOC
+    // (Ruby's xmalloc) in tk_create, so it must be released with xfree
+    // rather than the C library's free.
+    static void tk_free(Token *t)
+    {
+        xfree(t);
+    }
+    /*
+     * Return the text carried by this token.
+     *
+     * call-seq:
+     *   text()    -> text
+     *
+     */
+    static VALUE tk_text(VALUE self)
+    {
+        return static_cast<Token *>(DATA_PTR(self))->text;
+    }
+    /*
+     * Return the position at which this token starts.
+     *
+     * call-seq:
+     *   start()    -> start_pos
+     *
+     */
+    static VALUE tk_start(VALUE self)
+    {
+        return static_cast<Token *>(DATA_PTR(self))->start;
+    }
+    /*
+     * Return the position at which this token ends.
+     *
+     * call-seq:
+     *   end()    -> end_pos
+     *
+     */
+    static VALUE tk_end(VALUE self)
+    {
+        return static_cast<Token *>(DATA_PTR(self))->end;
+    }
+    static VALUE cToken;
+    /*
+     * Wrap a native rmmseg::Token in a new RMMSeg::Token Ruby object.
+     *
+     * +base+ is the start of the buffer being segmented. The token's
+     * start and end are stored as byte offsets of +t.text+ relative to
+     * +base+, and its text is copied into a fresh Ruby String.
+     */
+    static VALUE tk_create(const char* base, const rmmseg::Token &t)
+    {
+        Token *tk = ALLOC(Token);
+        int start = t.text-base;
+        // This is necessary, see
+        // http://lifegoo.pluskid.org/?p=348
+        // NOTE(review): presumably `volatile' keeps the new String
+        // reachable for Ruby's GC until the struct is wrapped -- confirm
+        // against the link above.
+        volatile VALUE text = rb_str_new(t.text, t.length);
+        tk->text = text;
+        tk->start = INT2FIX(start);
+        tk->end = INT2FIX(start + t.length);
+        volatile VALUE tok = Data_Wrap_Struct(cToken,
+                                (RUBY_DATA_FUNC)tk_mark,
+                                (RUBY_DATA_FUNC)tk_free,
+                                tk);
+        return tok;
+    }
+    /*********************
+     * Algorithm Class
+     *********************/
+    // Native data wrapped by an RMMSeg::Algorithm Ruby object: the Ruby
+    // String being segmented plus the native segmenter working on it.
+    struct Algorithm
+    {
+        VALUE text;             // hold to avoid being garbage collected
+        rmmseg::Algorithm *algor;   // native segmenter built from text's buffer
+    };
+    // GC mark hook for RMMSeg::Algorithm: keeps the source String alive
+    // for as long as the wrapper object is reachable.
+    static void algor_mark(Algorithm *wrapper)
+    {
+        rb_gc_mark(wrapper->text);
+    }
+    // GC free hook for RMMSeg::Algorithm.
+    //
+    // algor_create builds the native segmenter with placement new inside
+    // a malloc'ed buffer and allocates the wrapper struct with ALLOC.
+    // Both must be released here: the segmenter's destructor is run
+    // explicitly before its buffer is freed, and the wrapper goes back
+    // through xfree (it was leaked before).
+    static void algor_free(Algorithm *a)
+    {
+        if (a->algor != NULL)
+        {
+            // Qualified name: the unqualified `Algorithm' here would name
+            // the wrapper struct, not the native class.
+            a->algor->rmmseg::Algorithm::~Algorithm();
+            free(a->algor);
+        }
+        xfree(a);
+    }
+    static VALUE cAlgorithm;
+    /*
+     * Create an Algorithm object to do segmenting on +text+.
+     *
+     * call-seq:
+     *   new(text)    -> algorithm
+     *
+     * +text+ must be a String (or convertible with +to_str+); otherwise
+     * TypeError is raised. NoMemoryError is raised if the native
+     * segmenter cannot be allocated.
+     *
+     * NOTE(review): the native segmenter is handed a pointer into the
+     * String's buffer and presumably keeps it, so +text+ should not be
+     * modified while the Algorithm is in use -- confirm in algor.h.
+     */
+    static VALUE algor_create(VALUE klass, VALUE text)
+    {
+        // Type-check first: RSTRING_PTR/RSTRING_LEN are only valid on a
+        // real String.
+        StringValue(text);
+        Algorithm *algor = ALLOC(Algorithm);
+        void *mem = malloc(sizeof(rmmseg::Algorithm));
+        if (mem == NULL)
+        {
+            // Do not leak the wrapper when the native buffer is unavailable.
+            xfree(algor);
+            rb_memerror();
+        }
+        algor->text = text;
+        algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
+                                                  RSTRING_LEN(text));
+        return Data_Wrap_Struct(klass,
+                                (RUBY_DATA_FUNC)algor_mark,
+                                (RUBY_DATA_FUNC)algor_free,
+                                algor);
+    }
+    /*
+     * Fetch the next token from the text being segmented.
+     *
+     * call-seq:
+     *   next_token()   -> token
+     *
+     * Return +nil+ once no more tokens are available (the native
+     * segmenter signals this with a zero-length token).
+     */
+    static VALUE algor_next_token(VALUE self)
+    {
+        Algorithm *wrapper = static_cast<Algorithm *>(DATA_PTR(self));
+        rmmseg::Token next = wrapper->algor->next_token();
+        if (next.length != 0)
+        {
+            volatile VALUE wrapped = tk_create(RSTRING_PTR(wrapper->text), next);
+            return wrapped;
+        }
+        return Qnil;
+    }
+    /*
+     * Extension entry point: defines the RMMSeg module, the
+     * RMMSeg::Dictionary module functions and the RMMSeg::Token and
+     * RMMSeg::Algorithm classes.
+     */
+    void Init_rmmseg()
+    {
+        mRMMSeg = rb_define_module("RMMSeg");
+        /* Manage dictionaries used by rmmseg. */
+        mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
+        rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
+        rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
+        rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
+        rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
+        /* A Token hold the text and related position information. */
+        cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
+        // Tokens are only ever built by tk_create. Remove the default
+        // allocator as well as `new': Token.allocate would otherwise
+        // return a plain object, and the accessors below read it with
+        // DATA_PTR.
+        rb_undef_alloc_func(cToken);
+        rb_undef_method(rb_singleton_class(cToken), "new");
+        rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
+        rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
+        rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
+        /* An Algorithm object use the MMSEG algorithm to do segmenting. */
+        cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
+        // Same reasoning: instances must come from algor_create, which
+        // wraps the native data itself and never calls the allocator.
+        rb_undef_alloc_func(cAlgorithm);
+        rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
+        rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
+    }
+}

data/ext/rmmseg/rules.h ADDED Viewed

@@ -0,0 +1,86 @@
+#ifndef _RULES_H_
+#define _RULES_H_
+#include <vector>
+#include <algorithm>
+#include "chunk.h"
+namespace rmmseg
+{
+    /**
+     * Keep only the chunks that rank highest under cmp.
+     *
+     * cmp(a, b) must return a positive value when a ranks above b, zero
+     * when they tie and a negative value otherwise. On return, chunks
+     * holds every chunk tied for the best rank; the rest are erased.
+     * An empty vector is left untouched.
+     */
+    template <typename Cmp>
+    void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
+    {
+        // Erasing from begin()+1 on an empty vector is undefined.
+        if (chunks.empty())
+            return;
+        // chunks[0] is always one of the best seen so far and
+        // chunks[0..keep) are all the chunks tied with it.
+        std::vector<Chunk>::size_type keep = 1, j;
+        for (j = 1; j < chunks.size(); ++j)
+        {
+            int rlt = cmp(chunks[j], chunks[0]);
+            if (rlt > 0)
+                keep = 0;       // strictly better: restart the kept prefix
+            if (rlt >= 0)
+                std::swap(chunks[keep++], chunks[j]);
+        }
+        chunks.erase(chunks.begin()+keep, chunks.end());
+    }
+    // Comparator ranking chunks by total_length(): the result is
+    // positive when a's total length is greater than b's.
+    // NOTE(review): this object and mm_filter are defined (not just
+    // declared) in a header; including rules.h from more than one
+    // translation unit would produce duplicate definitions.
+    struct MMCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk &b) const
+        {
+            return a.total_length() - b.total_length();
+        }
+    } MMCmp;
+    // Keep only the chunks with the greatest total length.
+    void mm_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, MMCmp);
+    }
+    // Comparator ranking chunks by average_length(): returns 1 when a's
+    // average is larger than b's, 0 when equal and -1 otherwise.
+    struct LAWLCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk &b) const
+        {
+            const double diff = a.average_length() - b.average_length();
+            if (diff > 0)
+                return 1;
+            return (diff == 0) ? 0 : -1;
+        }
+    };
+    LAWLCmp_t LAWLCmp;
+    // Keep only the chunks with the largest average word length.
+    void lawl_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, LAWLCmp);
+    }
+    // Comparator ranking chunks by variance(), smaller being better:
+    // returns 1 when a's variance is lower than b's, 0 when equal and
+    // -1 otherwise.
+    struct SVWLCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk& b) const
+        {
+            const double diff = a.variance() - b.variance();
+            if (diff < 0)
+                return 1;
+            return (diff == 0) ? 0 : -1;
+        }
+    };
+    SVWLCmp_t SVWLCmp;
+    // Keep only the chunks with the smallest variance.
+    void svwl_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, SVWLCmp);
+    }
+    // Comparator ranking chunks by degree_of_morphemic_freedom(): the
+    // result is positive when a's value is greater than b's.
+    // NOTE(review): the difference is returned as int; if the accessor
+    // returns a floating-point value, differences smaller than 1 would
+    // truncate to 0 and be treated as ties -- confirm in chunk.h.
+    struct LSDMFOCWCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk& b) const
+        {
+            return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
+        }
+    } LSDMFOCWCmp;
+    // Keep only the chunks with the largest degree of morphemic freedom.
+    void lsdmfocw_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, LSDMFOCWCmp);
+    }
+}
+#endif /* _RULES_H_ */

data/ext/rmmseg/token.h ADDED Viewed

@@ -0,0 +1,19 @@
+#ifndef _TOKEN_H_
+#define _TOKEN_H_
+namespace rmmseg
+{
+    // A non-owning view of a piece of text: a pointer plus a byte count.
+    struct Token
+    {
+        // Start of the token's text. It may or may not be
+        // nul-terminated, so always use `length' to find its end.
+        const char *text;
+        // Size of the text; a token whose length is 0 is empty.
+        int length;
+        Token(const char *txt, int len)
+        {
+            text = txt;
+            length = len;
+        }
+    };
+}
+#endif /* _TOKEN_H_ */

data/ext/rmmseg/word.h ADDED Viewed

@@ -0,0 +1,44 @@
+#ifndef _WORD_H_
+#define _WORD_H_
+#include <climits>
+#include <cstring>
+#include "memory.h"
+namespace rmmseg
+{
+    const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
+    // Dictionary entry. The text is stored inline: make_word allocates
+    // sizeof(Word) + nbytes + 1 - word_embed_len bytes, so `text' extends
+    // past its declared size for longer words.
+    struct Word
+    {
+        unsigned char   nbytes;   /* number of bytes */
+        char            length;   /* number of characters */
+        unsigned short  freq;
+        char            text[word_embed_len];
+    };
+    /**
+     * Allocate a Word from the pool and fill it in.
+     *
+     * text: the text of the word.
+     * length: number of characters (not bytes).
+     * freq: the frequency of the word; clamped to [0, USHRT_MAX].
+     * nbytes: number of bytes of text to store; any negative value
+     *         means "measure text with strlen".
+     *
+     * NOTE(review): nbytes and length are stored in 8-bit fields, so
+     * callers are expected to pass values that fit; larger values are
+     * truncated on assignment.
+     */
+    inline Word *make_word(const char *text, int length=1,
+                           int freq=0, int nbytes=-1)
+    {
+        if (freq > USHRT_MAX)
+            freq = USHRT_MAX;   /* avoid overflow */
+        else if (freq < 0)
+            freq = 0;           /* a negative value would wrap around */
+        if (nbytes < 0)
+            nbytes = static_cast<int>(std::strlen(text));
+        Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
+                                                 + nbytes+1
+                                                 - word_embed_len));
+        w->nbytes = nbytes;
+        w->length = length;
+        w->freq = freq;
+        std::strncpy(w->text, text, nbytes);
+        w->text[nbytes] = '\0';
+        return w;
+    }
+}
+#endif /* _WORD_H_ */

data/lib/rmmseg/dictionary.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module RMMSeg
+  module Dictionary
+    @dictionaries = [
+                     [:chars, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "chars.dic")],
+                     [:words, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "words.dic")]
+                    ]
+    class << self
+      #
+      # An array of dictionaries used by RMMSeg. Each entry is of the
+      # following form:
+      #
+      #   [type, path]
+      #
+      # where +type+ can either <tt>:chars</tt> or <tt>:words</tt>. +path+ is the path
+      # to the dictionary file.
+      #
+      # The format of <tt>:chars</tt> dictionary is a collection of lines of the
+      # following form:
+      #
+      #   freq char
+      #
+      # Where +frequency+ is a number <b>less than 65535</b>. +char+ is the
+      # character. They are spearated by <b>exactly one space</b>.
+      #
+      # The format of <tt>:words</tt> dictionary is similar:
+      #
+      #   length word
+      #
+      # except the first number is not the frequency, but the number of
+      # characters (not number of bytes) in the word.
+      #
+      # There's a script (convert.rb) in the tools directory that can be used
+      # to convert and normalize dictionaries.
+      attr_accessor :dictionaries
+      # Add a user defined dictionary, +type+ can be
+      # +:chars+ or <tt>:words</tt>. See doc of dictionaries.
+      def add_dictionary(path, type)
+        @dictionaries << [type, path]
+      end
+      # Load dictionaries. Call this method after set up the path of the
+      # dictionaries needed to load and before any Algorithm object is
+      # created.
+      def load_dictionaries()
+        @dictionaries.each do |type, path|
+          if type == :chars
+            load_chars(path)
+          elsif type == :words
+            load_words(path)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/rmmseg/ferret.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'rubygems'
+require 'rmmseg'
+require 'ferret'
+module RMMSeg
+  module Ferret
+        # The Analyzer class can be used with Ferret .
+    class Analyzer < ::Ferret::Analysis::Analyzer
+      # Construct an Analyzer. Optional block can be used to
+      # add more +TokenFilter+s. e.g.
+      #
+      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      #   }
+      #
+      def initialize(&brk)
+        @brk = brk
+      end
+      def token_stream(field, text)
+        t = Tokenizer.new(text)
+        if @brk
+          @brk.call(t)
+        else
+          t
+        end
+      end
+    end
+    # The Tokenizer tokenize text with RMMSeg::Algorithm.
+    class Tokenizer < ::Ferret::Analysis::TokenStream
+      # Create a new Tokenizer to tokenize +text+
+      def initialize(str)
+        self.text = str
+      end
+      # Get next token
+      def next
+        tok = @algor.next_token
+        if tok.nil?
+          return nil
+        else
+          @token.text = tok.text
+          @token.start = tok.start
+          @token.end = tok.end
+          return @token
+        end
+      end
+      # Get the text being tokenized
+      def text
+        @text
+      end
+      # Set the text to be tokenized
+      def text=(str)
+        @token = ::Ferret::Analysis::Token.new("", 0, 0)
+        @text = str
+        @algor = Algorithm.new(@text)
+      end
+    end
+  end
+end

data/lib/rmmseg-cpp-traditional/version.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module Rmmseg
+  module Cpp
+    module Traditional
+      VERSION = "0.0.1"
+    end
+  end
+end

data/lib/rmmseg-cpp-traditional.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require "rmmseg-cpp-traditional/version"
+module Rmmseg
+  module Cpp
+    module Traditional
+      # Namespace for this gem. It only carries VERSION, which is loaded
+      # from rmmseg-cpp-traditional/version above.
+      # NOTE(review): this file does not require 'rmmseg', so the
+      # segmenter itself presumably has to be required separately.
+    end
+  end
+end

data/lib/rmmseg.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
+require File.join(File.dirname(__FILE__), '..',
+                  'ext', 'rmmseg', 'rmmseg')

data/misc/convert.rb ADDED Viewed

@@ -0,0 +1,114 @@
+#!/usr/bin/ruby
+# A utility used to convert the old RMMSeg dictionary
+# to rmmseg-cpp format.
+# There are several constraints on the new rmmseg-cpp
+# dictionary format.
+#  - length of word should be specified in the dict
+#  - number and string should be separated by ONE space
+#  - there should be a newline at the end of file
+$KCODE='u'
+require 'jcode'
+def usage(msg=nil)
+  puts "***ERROR: #{msg}\n\n" if msg
+  puts <<EOT
+Usage:
+#{$0} action type input.dic output.dic
+  action: either 'convert' or 'normalize'
+           - 'convert' is used to convert the dict from
+             old RMMSeg format.
+           - 'normalize' is used to normalize an existing
+             rmmseg-cpp dict.
+  type:   either 'words' or 'chars'
+EOT
+  exit(0)
+end
+# Validate the command line: exactly four arguments, a known action and
+# a known dictionary type.
+usage unless ARGV.size == 4
+usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
+usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
+def output(data)
+  File.open(ARGV[3], "w") do |f|
+    data.each do |num, word|
+      f.puts "#{num} #{word}" if word
+    end
+  end
+end
+def read_RMMSeg_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(.)\s+(\d+)$/
+      n = $2.to_i
+      max = n if n > max
+      [n, $1]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_RMMSeg_words
+  File.readlines(ARGV[2]).map do |line|
+    line.chomp!
+    if !line.empty?
+      [line.jlength, line]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(.)$/
+      n = $1.to_i
+      max = n if n > max
+      [n, $2]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_words
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(\w+)$/
+      [$1, $2]
+    else
+      [nil, nil]
+    end
+  end
+end
+case ARGV[0,2]
+when ['convert', 'chars']
+  output(read_RMMSeg_chars)
+when ['convert', 'words']
+  output(read_RMMSeg_words)
+when ['normalize', 'chars']
+  output(read_rmmseg_cpp_chars)
+when ['normalize', 'words']
+  output(read_rmmseg_cpp_words)
+end