RubyGems - lijia-rmmseg-cpp - Versions diffs - 10.2.9.2 - Mend

lijia-rmmseg-cpp 10.2.9.2

Files changed (45) hide show

@@ -0,0 +1,34 @@
+#ifndef _DICT_H_
+#define _DICT_H_
+#include "word.h"
+/**
+ * A dictionary is a hash table of
+ *  - key: string
+ *  - value: word
+ *
+ * Dictionary data can be loaded from files. Two types of dictionary
+ * files are supported:
+ *  - character file: Each line contains a number and a character,
+ *                    the number is the frequency of the character.
+ *                    The frequency should NOT exceed 65535.
+ *  - word file:      Each line contains a number and a word, the
+ *                    number is the character count of the word.
+ */
+namespace rmmseg
+{
+    /* The dictionary is exposed as free functions in a namespace
+     * rather than as a class, because there would only ever be one
+     * instance of it. */
+    namespace dict
+    {
+        /* Add `word` to the dictionary. */
+        void  add(Word *word);
+        /* Load a character dictionary file (format described in the
+         * comment at the top of this header). The Ruby binding treats
+         * a true return value as "loaded successfully". */
+        bool  load_chars(const char *filename);
+        /* Load a word dictionary file (format described in the
+         * comment at the top of this header). The Ruby binding treats
+         * a true return value as "loaded successfully". */
+        bool  load_words(const char *filename);
+        /* Look up the entry for the string starting at `str`. The Ruby
+         * binding passes the string's byte count as `len` and treats a
+         * NULL result as "word not in the dictionary". */
+        Word *get(const char *str, int len);
+    }
+}
+#endif /* _DICT_H_ */

data/ext/rmmseg/extconf.rb ADDED

@@ -0,0 +1,17 @@
+# Build configuration for the rmmseg native extension; generates the
+# Makefile through mkmf.
+require 'mkmf'
+# Link with g++ instead of the C compiler: a leading `$(CC)` in the
+# shared-library link command is replaced by `g++`.
+CONFIG['LDSHARED'] = CONFIG['LDSHARED'].sub(/^\$\(CC\)/, 'g++')
+# Earlier per-platform overrides, kept commented out for reference:
+# if RUBY_PLATFORM =~ /darwin/
+# #  CONFIG['LDSHARED'] = 'g++ --dynamiclib -flat_namespace -undefined suppress'
+#   CONFIG['LDSHARED'] = 'g++ --dynamiclib'
+# elsif RUBY_PLATFORM =~ /linux/
+#   CONFIG['LDSHARED'] = 'g++ -shared'
+# end
+# On darwin, when `which brew` prints nothing (no brew executable found),
+# replace the link command entirely with explicit dynamiclib flags.
+if RUBY_PLATFORM =~ /darwin/ and `which brew`.empty?
+  CONFIG['LDSHARED'] = 'g++ -dynamiclib -single_module -flat_namespace -undefined suppress'
+end
+# Object files that make up the extension.
+$objs = ['algor.o', 'dict.o', 'memory.o', 'rmmseg.o']
+# Write the Makefile for the extension named "rmmseg".
+create_makefile('rmmseg')

data/ext/rmmseg/memory.cpp ADDED

@@ -0,0 +1,9 @@
+#include "memory.h"
+#define PRE_ALLOC_SIZE 2097152 /* 2MB */
+namespace rmmseg
+{
+    /* Start of the unused part of the memory pool. The initial chunk of
+     * PRE_ALLOC_SIZE bytes is obtained from malloc when this variable
+     * is initialized.
+     * NOTE(review): the malloc result is not checked for NULL here. */
+    char   *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
+    /* Number of bytes available starting at _pool_base; begins as the
+     * full size of the initial chunk. */
+    size_t  _pool_size = PRE_ALLOC_SIZE;
+}

data/ext/rmmseg/memory.h ADDED

@@ -0,0 +1,43 @@
+#ifndef _MEMORY_H_
+#define _MEMORY_H_
+#include <cstdlib>
+/**
+ * Pre-allocate a chunk of memory and hand it out in small pieces.
+ * This memory is never freed after allocation. Used for persistent
+ * data like dictionary contents that will never be destroyed until
+ * the application exits.
+ */
+namespace rmmseg
+{
+    /* Minimum size of a chunk requested from malloc once the current
+     * chunk cannot satisfy a request. */
+    const size_t REALLOC_SIZE = 2048; /* 2KB */
+    /* Number of unused bytes left in the current chunk. */
+    extern size_t  _pool_size;
+    /* First unused byte of the current chunk. */
+    extern char   *_pool_base;
+    /**
+     * Carve `len` bytes out of the pool.
+     *
+     * The request is served from the current chunk when it fits.
+     * Otherwise a new chunk is obtained from malloc; it is at least
+     * REALLOC_SIZE bytes and always large enough to hold `len` bytes.
+     * Memory handed out is never returned to the system.
+     *
+     * Returns a pointer to the start of the `len` bytes, or NULL if a
+     * new chunk was needed and malloc failed (the pool is left
+     * untouched in that case).
+     *
+     * NOTE(review): the pointer is simply bumped by `len`, so returned
+     * addresses carry no alignment guarantee beyond that of the
+     * previous allocation sizes.
+     */
+    inline void *pool_alloc(size_t len)
+    {
+        if (len <= _pool_size)
+        {
+            void *mem = _pool_base;
+            _pool_size -= len;
+            _pool_base += len;
+            return mem;
+        }
+        /* NOTE: the remaining memory of the old chunk is simply
+         * discarded, which WILL leak it. This function is not meant
+         * for large objects, and a larger pre-allocated chunk reduces
+         * the impact, so this is generally not a problem. */
+        /* A request larger than REALLOC_SIZE must get a chunk of its
+         * own size; a fixed-size chunk would be overrun and the
+         * unsigned "REALLOC_SIZE - len" would wrap around. */
+        const size_t chunk_size = (len > REALLOC_SIZE) ? len : REALLOC_SIZE;
+        char *chunk = static_cast<char *>(std::malloc(chunk_size));
+        if (chunk == NULL)
+            return NULL;
+        _pool_base = chunk + len;
+        _pool_size = chunk_size - len;
+        return chunk;
+    }
+}
+#endif /* _MEMORY_H_ */

data/ext/rmmseg/rmmseg.cpp ADDED

@@ -0,0 +1,263 @@
+#include <ruby.h>
+#include <cstdio>               // for debug
+#include "token.h"
+#include "dict.h"
+#include "algor.h"
+using namespace std;
+extern "C" {
+    /*****************************************
+     *
+     * Normal interface
+     *
+     *****************************************/
+    /*********************
+     * RMMSeg module
+     *********************/
+    static VALUE mRMMSeg;
+    /*********************
+     * Dictionary module
+     *********************/
+    static VALUE mDictionary;
+    /*
+     * Load a character dictionary.
+     *
+     * call-seq:
+     *   load_chars(path)    -> status
+     *
+     * Return +true+ if loaded successfully, +false+ otherwise.
+     */
+    static VALUE dic_load_chars(VALUE mod, VALUE path)
+    {
+        // StringValueCStr validates the argument (TypeError for
+        // non-Strings, ArgumentError for embedded NUL bytes) instead of
+        // reading a char pointer out of an arbitrary Ruby object.
+        const char *filename = StringValueCStr(path);
+        return rmmseg::dict::load_chars(filename) ? Qtrue : Qfalse;
+    }
+    /*
+     * Load a word dictionary.
+     *
+     * call-seq:
+     *   load_words(path)    -> status
+     *
+     * Return +true+ if loaded successfully, +false+ otherwise.
+     */
+    static VALUE dic_load_words(VALUE mod, VALUE path)
+    {
+        // StringValueCStr validates the argument (TypeError for
+        // non-Strings, ArgumentError for embedded NUL bytes) instead of
+        // reading a char pointer out of an arbitrary Ruby object.
+        const char *filename = StringValueCStr(path);
+        return rmmseg::dict::load_words(filename) ? Qtrue : Qfalse;
+    }
+    /*
+     * Add a word to the in-memory dictionary.
+     *
+     * call-seq:
+     *   add(word, length, freq)
+     *
+     * - +word+ is a String.
+     * - +length+ is number of characters (not number of bytes) of the
+     *   word to be added.
+     * - +freq+ is the frequency of the word. This is only used when
+     *   it is a one-character word.
+     */
+    static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
+    {
+        // Convert the numeric arguments first: NUM2INT raises on
+        // non-numeric or out-of-range values (FIX2INT would silently
+        // decode garbage), and doing it before taking the string
+        // pointer means no conversion can run while we hold that pointer.
+        const int nchars = NUM2INT(len);
+        const int frequency = NUM2INT(freq);
+        // Raise TypeError for non-String arguments instead of reading
+        // a char pointer out of an arbitrary Ruby object.
+        StringValue(word);
+        const char *str = RSTRING_PTR(word);
+        const int nbytes = static_cast<int>(RSTRING_LEN(word));
+        rmmseg::Word *w = rmmseg::make_word(str, nchars, frequency, nbytes);
+        rmmseg::dict::add(w);
+        return Qnil;
+    }
+    /*
+     * Check whether one word is included in the dictionary.
+     *
+     * call-seq:
+     *   has_word?(word)    -> result
+     *
+     * Return +true+ if the word is included in the dictionary,
+     * +false+ otherwise.
+     */
+    static VALUE dic_has_word(VALUE mod, VALUE word)
+    {
+        // Raise TypeError for non-String arguments instead of reading
+        // a char pointer out of an arbitrary Ruby object.
+        StringValue(word);
+        const char *str = RSTRING_PTR(word);
+        const int nbytes = static_cast<int>(RSTRING_LEN(word));
+        // The dictionary is keyed by the raw bytes of the word.
+        return (rmmseg::dict::get(str, nbytes) != NULL) ? Qtrue : Qfalse;
+    }
+    /**********************
+     * Token Class
+     **********************/
+    // C-side payload of an RMMSeg::Token Ruby object.
+    struct Token
+    {
+        VALUE text;   // Ruby String holding a copy of the token's bytes
+        VALUE start;  // Fixnum: byte offset of the token within the source text
+        VALUE end;    // Fixnum: start plus the token's byte length
+    };
+    // GC mark callback for Token objects.
+    static void tk_mark(Token *token)
+    {
+        // Only the String needs marking; the two positions are stored
+        // as Fixnums, which are not heap objects.
+        rb_gc_mark(token->text);
+    }
+    // GC free callback for Token objects.
+    static void tk_free(Token *t)
+    {
+        // The struct is allocated with ALLOC() in tk_create, i.e. by
+        // Ruby's allocator, so it must be released with xfree() rather
+        // than the C library's free().
+        xfree(t);
+    }
+    /*
+     * Get the text held by this token.
+     *
+     * call-seq:
+     *   text()    -> text
+     *
+     */
+    static VALUE tk_text(VALUE self)
+    {
+        // The wrapped C struct carries the String created in tk_create.
+        return static_cast<Token *>(DATA_PTR(self))->text;
+    }
+    /*
+     * Get the start position of this token.
+     *
+     * call-seq:
+     *   start()    -> start_pos
+     *
+     */
+    static VALUE tk_start(VALUE self)
+    {
+        // Already stored as a Fixnum, so it can be returned as-is.
+        return static_cast<Token *>(DATA_PTR(self))->start;
+    }
+    /*
+     * Get the end position of this token.
+     *
+     * call-seq:
+     *   end()    -> end_pos
+     *
+     */
+    static VALUE tk_end(VALUE self)
+    {
+        // Already stored as a Fixnum, so it can be returned as-is.
+        return static_cast<Token *>(DATA_PTR(self))->end;
+    }
+    static VALUE cToken;
+    // Build an RMMSeg::Token Ruby object from a native token.
+    //  - base: start of the text buffer the token points into; used to
+    //          turn the token's pointer into an offset.
+    //  - t:    native token (pointer into the buffer plus byte length).
+    // Returns the new Ruby object wrapping a freshly allocated Token.
+    static VALUE tk_create(const char* base, const rmmseg::Token &t)
+    {
+        Token *tk = ALLOC(Token);
+        // Offset of the token from the beginning of the buffer.
+        int start = t.text-base;
+        // This is necessary, see
+        // http://lifegoo.pluskid.org/?p=348
+        volatile VALUE text = rb_str_new(t.text, t.length);
+        tk->text = text;
+        tk->start = INT2FIX(start);
+        tk->end = INT2FIX(start + t.length);
+        // Hand ownership of tk to Ruby: tk_mark keeps the String alive,
+        // tk_free releases the struct when the object is collected.
+        volatile VALUE tok = Data_Wrap_Struct(cToken,
+                                (RUBY_DATA_FUNC)tk_mark,
+                                (RUBY_DATA_FUNC)tk_free,
+                                tk);
+        return tok;
+    }
+    /*********************
+     * Algorithm Class
+     *********************/
+    // C-side payload of an RMMSeg::Algorithm Ruby object.
+    struct Algorithm
+    {
+        VALUE text;             // hold to avoid being garbage collected
+        rmmseg::Algorithm *algor; // native segmenter, constructed over text's bytes
+    };
+    // GC mark callback for Algorithm objects.
+    static void algor_mark(Algorithm *wrapper)
+    {
+        // Keep the source String alive for as long as the wrapper lives.
+        rb_gc_mark(wrapper->text);
+    }
+    // GC free callback for Algorithm objects: tears down the native
+    // segmenter and then the wrapper struct itself.
+    static void algor_free(Algorithm *a)
+    {
+        if (a->algor != NULL)
+        {
+            // The segmenter was built with placement new into malloc'ed
+            // storage, so run its destructor explicitly before handing
+            // the storage back with free().
+            a->algor->rmmseg::Algorithm::~Algorithm();
+            free(a->algor);
+        }
+        // The wrapper comes from ALLOC() (Ruby's allocator) and was
+        // previously never released.
+        xfree(a);
+    }
+    static VALUE cAlgorithm;
+    /*
+     * Create an Algorithm object to do segmenting on +text+.
+     *
+     * call-seq:
+     *   new(text)    -> algorithm
+     *
+     */
+    static VALUE algor_create(VALUE klass, VALUE text)
+    {
+        // Validate before allocating anything: raises TypeError for
+        // non-String arguments instead of reading a char pointer out of
+        // an arbitrary Ruby object.
+        StringValue(text);
+        Algorithm *algor = ALLOC(Algorithm);
+        algor->text = text;
+        algor->algor = NULL;
+        void *mem = malloc(sizeof(rmmseg::Algorithm));
+        if (mem == NULL)
+        {
+            // Do not construct into a null pointer; release the wrapper
+            // and report the failure as a Ruby NoMemoryError.
+            xfree(algor);
+            rb_memerror();
+        }
+        // NOTE(review): the segmenter is given a raw pointer into the
+        // String's buffer; if the caller mutates the String afterwards
+        // that pointer may dangle. Consider holding a frozen copy.
+        algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
+                                                  RSTRING_LEN(text));
+        return Data_Wrap_Struct(klass,
+                                (RUBY_DATA_FUNC)algor_mark,
+                                (RUBY_DATA_FUNC)algor_free,
+                                algor);
+    }
+    /*
+     * Get next token.
+     *
+     * call-seq:
+     *   next_token()   -> token
+     *
+     * Return +nil+ if no more token available.
+     */
+    static VALUE algor_next_token(VALUE self)
+    {
+        Algorithm *wrapper = static_cast<Algorithm *>(DATA_PTR(self));
+        rmmseg::Token next = wrapper->algor->next_token();
+        // A zero-length token signals that the text is exhausted.
+        if (next.length != 0)
+        {
+            // Offsets in the Ruby token are relative to the start of
+            // the source String's buffer.
+            volatile VALUE result = tk_create(RSTRING_PTR(wrapper->text), next);
+            return result;
+        }
+        return Qnil;
+    }
+    // Extension entry point: defines the RMMSeg module together with
+    // its Dictionary module and its Token and Algorithm classes, and
+    // binds the C functions above as their methods.
+    void Init_rmmseg()
+    {
+        mRMMSeg = rb_define_module("RMMSeg");
+        /* Manage dictionaries used by rmmseg. */
+        mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
+        rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
+        rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
+        rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
+        rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
+        /* A Token hold the text and related position information. */
+        cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
+        // Tokens are only created from C (tk_create), so Token.new is removed.
+        rb_undef_method(rb_singleton_class(cToken), "new");
+        rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
+        rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
+        rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
+        /* An Algorithm object use the MMSEG algorithm to do segmenting. */
+        cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
+        // Algorithm.new is bound directly to algor_create, which wraps
+        // the native segmenter.
+        rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
+        rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
+    }
+}

data/ext/rmmseg/rules.h ADDED

@@ -0,0 +1,86 @@
+#ifndef _RULES_H_
+#define _RULES_H_
+#include <vector>
+#include <algorithm>
+#include "chunk.h"
+namespace rmmseg
+{
+    /**
+     * Reduce `chunks` in place to the elements that rank highest
+     * under `cmp`.
+     *
+     * `cmp(a, b)` must return a positive value when `a` ranks higher
+     * than `b`, zero when they tie, and a negative value otherwise.
+     * All chunks tying for the highest rank are kept; an empty vector
+     * is left untouched.
+     */
+    template <typename Cmp>
+    void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
+    {
+        // With no elements, begin()+1 below would be past the end.
+        if (chunks.empty())
+            return;
+        // chunks[0] is always the best chunk seen so far and
+        // chunks[0..i) are the chunks tying with it.
+        unsigned int i = 1, j;
+        for (j = 1; j < chunks.size(); ++j)
+        {
+            int rlt = cmp(chunks[j], chunks[0]);
+            // A strictly better chunk discards everything kept so far;
+            // the swap below then moves it into slot 0.
+            if (rlt > 0)
+                i = 0;
+            if (rlt >= 0)
+                std::swap(chunks[i++], chunks[j]);
+        }
+        chunks.erase(chunks.begin()+i, chunks.end());
+    }
+    /* Ranks the chunk with the larger total_length() higher. */
+    struct MMCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk &b) const
+        {
+            return a.total_length() - b.total_length();
+        }
+    };
+    /* Internal linkage: this header defines the object, so an
+     * external-linkage definition would clash if the header were
+     * included from more than one translation unit. */
+    static const MMCmp_t MMCmp = MMCmp_t();
+    /* Keep only the chunks with the largest total_length(). Declared
+     * inline because the definition lives in a header. */
+    inline void mm_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, MMCmp_t());
+    }
+    /* Ranks the chunk with the larger average_length() higher. */
+    struct LAWLCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk &b) const
+        {
+            // Collapse the floating-point difference to -1 / 0 / 1 so
+            // it is not truncated when returned as an int.
+            double rlt = a.average_length() - b.average_length();
+            if (rlt == 0)
+                return 0;
+            if (rlt > 0)
+                return 1;
+            return -1;
+        }
+    };
+    /* Internal linkage: this header defines the object, so an
+     * external-linkage definition would clash if the header were
+     * included from more than one translation unit. */
+    static const LAWLCmp_t LAWLCmp = LAWLCmp_t();
+    /* Keep only the chunks with the largest average_length(). Declared
+     * inline because the definition lives in a header. */
+    inline void lawl_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, LAWLCmp_t());
+    }
+    /* Ranks the chunk with the smaller variance() higher. */
+    struct SVWLCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk& b) const
+        {
+            double rlt = a.variance() - b.variance();
+            if (rlt == 0)
+                return 0;
+            // Inverted on purpose: a lower variance is the better rank.
+            if (rlt < 0)
+                return 1;
+            return -1;
+        }
+    };
+    /* Internal linkage: this header defines the object, so an
+     * external-linkage definition would clash if the header were
+     * included from more than one translation unit. */
+    static const SVWLCmp_t SVWLCmp = SVWLCmp_t();
+    /* Keep only the chunks with the smallest variance(). Declared
+     * inline because the definition lives in a header. */
+    inline void svwl_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, SVWLCmp_t());
+    }
+    /* Ranks the chunk with the larger degree_of_morphemic_freedom()
+     * higher.
+     * NOTE(review): the difference is returned as an int; confirm that
+     * degree_of_morphemic_freedom() is integral so nothing is truncated. */
+    struct LSDMFOCWCmp_t
+    {
+        int operator()(const Chunk &a, const Chunk& b) const
+        {
+            return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
+        }
+    };
+    /* Internal linkage: this header defines the object, so an
+     * external-linkage definition would clash if the header were
+     * included from more than one translation unit. */
+    static const LSDMFOCWCmp_t LSDMFOCWCmp = LSDMFOCWCmp_t();
+    /* Keep only the chunks with the largest
+     * degree_of_morphemic_freedom(). Declared inline because the
+     * definition lives in a header. */
+    inline void lsdmfocw_filter(std::vector<Chunk> &chunks)
+    {
+        take_highest(chunks, LSDMFOCWCmp_t());
+    }
+}
+#endif /* _RULES_H_ */