RubyGems - pluskid-rmmseg-cpp - Versions diffs - 0.2.0 - Mend

pluskid-rmmseg-cpp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/ext/rmmseg/algor.cpp ADDED Viewed

@@ -0,0 +1,216 @@
+#include <cctype>
+#include <cassert>
+#include "rules.h"
+#include "algor.h"
+using namespace std;
+namespace rmmseg
+{
+    /**
+     * Produce the next non-empty token of the text.
+     *
+     * Returns Token(NULL, 0) once the read position has reached the end
+     * of the text. Empty tokens coming back from the word readers are
+     * skipped silently.
+     */
+    Token Algorithm::next_token()
+    {
+        for (;;)
+        {
+            if (m_pos >= m_text_length)
+                return Token(NULL, 0);
+            // A one-byte lead means basic latin; anything longer is
+            // handed to the dictionary-driven CJK path.
+            const int char_bytes = next_char();
+            Token found = (char_bytes == 1)
+                ? get_basic_latin_word()
+                : get_cjk_word(char_bytes);
+            if (found.length > 0)
+                return found;
+        }
+    }
+    /**
+     * Read one run of single-byte alphanumeric characters.
+     *
+     * Single-byte characters that are not alphanumeric (whitespace,
+     * punctuation, ...) are skipped both before and after the word. The
+     * scan stops as soon as a multi-byte character is reached. The
+     * returned token may be empty (length 0) when no alphanumeric byte
+     * was found; m_pos is left just past everything that was consumed.
+     */
+    Token Algorithm::get_basic_latin_word()
+    {
+        int len = 1;
+        int start, end;
+        // Skip pre-word whitespaces and punctuations
+        while (m_pos < m_text_length)
+        {
+            if (len > 1)
+                break;
+            // isalnum() requires a value representable as unsigned char
+            if (isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;
+            // Never peek at the byte one past the end of the text
+            len = (m_pos < m_text_length) ? next_char() : 1;
+        }
+        start = m_pos;
+        // The word itself
+        while (m_pos < m_text_length)
+        {
+            if (len > 1)
+                break;
+            if (!isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;
+            len = (m_pos < m_text_length) ? next_char() : 1;
+        }
+        end = m_pos;
+        // Skip post-word whitespaces and punctuations
+        while (m_pos < m_text_length)
+        {
+            if (len > 1)
+                break;
+            if (isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;
+            len = (m_pos < m_text_length) ? next_char() : 1;
+        }
+        return Token(m_text+start, end-start);
+    }
+    /**
+     * Read one word starting at a multi-byte character.
+     *
+     * len: byte length announced by the lead byte at m_pos (as returned
+     *      by next_char()).
+     *
+     * All candidate chunks are built and then narrowed down by the four
+     * filters, each one applied only while more than one candidate is
+     * left. The first word of the first surviving chunk becomes the
+     * token, and m_pos is advanced past it.
+     */
+    Token Algorithm::get_cjk_word(int len)
+    {
+        vector<Chunk> chunks = create_chunks();
+        if (chunks.empty())
+        {
+            // create_chunks() yields nothing when every candidate runs
+            // past the end of the text (a truncated multi-byte sequence
+            // at the tail). Emit the bytes of the current character,
+            // clamped to what is left, so the scan always advances.
+            int n = m_text_length - m_pos;
+            if (len < n)
+                n = len;
+            Token tail(m_text+m_pos, n);
+            m_pos += n;
+            return tail;
+        }
+        if (chunks.size() > 1)
+            mm_filter(chunks);
+        if (chunks.size() > 1)
+            lawl_filter(chunks);
+        if (chunks.size() > 1)
+            svwl_filter(chunks);
+        if (chunks.size() > 1)
+            lsdmfocw_filter(chunks);
+        Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
+        m_pos += chunks[0].words[0]->nbytes;
+        return token;
+    }
+    /**
+     * Enumerate every chunk (one to three successive words) that can
+     * start at the current position.
+     *
+     * A chunk of fewer than three words is recorded only when its words
+     * end exactly at the end of the text, or when the third candidate is
+     * a tmp word (length == -1). m_pos is moved around while exploring
+     * and is put back to its starting value before returning.
+     */
+    vector<Chunk> Algorithm::create_chunks()
+    {
+        typedef vector<Word *> word_list;
+        typedef word_list::size_type index_t;
+        vector<Chunk> result;
+        Chunk candidate;
+        const int start_pos = m_pos;
+        const word_list firsts = find_match_words();
+        for (index_t a = 0; a < firsts.size(); ++a)
+        {
+            Word *first = firsts[a];
+            candidate.words[0] = first;
+            m_pos += first->nbytes;
+            if (m_pos == m_text_length)
+            {
+                // The first word already reaches the end of the text.
+                candidate.n = 1;
+                result.push_back(candidate);
+            }
+            else if (m_pos < m_text_length)
+            {
+                const word_list seconds = find_match_words();
+                for (index_t b = 0; b < seconds.size(); ++b)
+                {
+                    Word *second = seconds[b];
+                    candidate.words[1] = second;
+                    m_pos += second->nbytes;
+                    if (m_pos == m_text_length)
+                    {
+                        // Two words reach the end of the text.
+                        candidate.n = 2;
+                        result.push_back(candidate);
+                    }
+                    else if (m_pos < m_text_length)
+                    {
+                        const word_list thirds = find_match_words();
+                        for (index_t c = 0; c < thirds.size(); ++c)
+                        {
+                            Word *third = thirds[c];
+                            // A tmp word is not stored as third word
+                            const bool is_tmp = (third->length == -1);
+                            if (!is_tmp)
+                                candidate.words[2] = third;
+                            candidate.n = is_tmp ? 2 : 3;
+                            result.push_back(candidate);
+                        }
+                    }
+                    m_pos -= second->nbytes;    // backtrack
+                }
+            }
+            m_pos -= first->nbytes;             // backtrack
+        }
+        m_pos = start_pos;
+        return result;
+    }
+    /**
+     * Byte length of the character at m_pos, judged from its lead byte.
+     * ONLY for UTF-8: 0xC0-0xDF gives 2, 0xE0-0xEF gives 3 and every
+     * other byte value gives 1.
+     */
+    int Algorithm::next_char()
+    {
+        const unsigned char lead = static_cast<unsigned char>(m_text[m_pos]);
+        if (lead < 0xC0 || lead > 0xEF)
+            return 1;
+        return (lead <= 0xDF) ? 2 : 3;
+    }
+    /**
+     * Collect every dictionary word that starts at m_pos.
+     *
+     * Prefixes of 1 .. max_word_length() multi-byte characters are
+     * looked up in the dictionary. When none matches, a single tmp word
+     * (length == -1) covering the character at m_pos is returned instead,
+     * so the result is never empty. Results are remembered in a small
+     * round-robin cache keyed by position. m_pos is unchanged on return.
+     *
+     * Callers in this file only invoke this with m_pos < m_text_length.
+     */
+    vector<Word *> Algorithm::find_match_words()
+    {
+        for (int i = 0; i < match_cache_size; ++i)
+            if (m_match_cache[i].first == m_pos)
+                return m_match_cache[i].second;
+        vector<Word *> words;
+        Word *word;
+        const int orig_pos = m_pos;
+        int n = 0, len;
+        while (m_pos < m_text_length)
+        {
+            len = next_char();
+            if (len <= 1)
+                break;
+            if (n >= max_word_length())
+                break;
+            // A truncated sequence at the very end of the text must not
+            // make the dictionary lookup read past the buffer.
+            if (m_pos + len > m_text_length)
+                break;
+            m_pos += len;
+            n++;
+            word = dict::get(m_text+orig_pos, m_pos-orig_pos);
+            if (word)
+                words.push_back(word);
+        }
+        m_pos = orig_pos;
+        if (words.empty())
+        {
+            int nbytes = next_char();
+            // Clamp to the bytes that really exist so neither the copy
+            // below nor the callers step beyond the end of the text.
+            const int remaining = m_text_length - m_pos;
+            if (remaining > 0 && nbytes > remaining)
+                nbytes = remaining;
+            word = get_tmp_word();
+            word->nbytes = nbytes;
+            word->length = -1;
+            strncpy(word->text, m_text+m_pos, word->nbytes);
+            word->text[word->nbytes] = '\0';
+            words.push_back(word);
+        }
+        m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
+        m_match_cache_i++;
+        if (m_match_cache_i >= match_cache_size)
+            m_match_cache_i = 0;
+        return words;
+    }
+}

data/ext/rmmseg/algor.h ADDED Viewed

@@ -0,0 +1,75 @@
+#ifndef _ALGORITHM_H_
+#define _ALGORITHM_H_
+#include <vector>
+#include "chunk.h"
+#include "token.h"
+#include "dict.h"
+/**
+ * The Algorithm of MMSeg uses four rules:
+ *  - Maximum matching rule
+ *  - Largest average word length rule
+ *  - Smallest variance of word length rule
+ *  - Largest sum of degree of morphemic freedom of one-character
+ *    words rule
+ */
+namespace rmmseg
+{
+    /* Segments one text buffer; each next_token() call yields the
+     * following token. */
+    class Algorithm
+    {
+    public:
+        /* text:   the buffer to segment (only the pointer is stored)
+         * length: size of the buffer in bytes */
+        Algorithm(const char *text, int length)
+            :m_text(text), m_pos(0),
+            m_text_length(length),
+            m_tmp_words_i(0),
+            m_match_cache_i(0)
+            {
+                /* Position -1 marks a cache slot as unused. */
+                int slot = match_cache_size;
+                while (slot-- > 0)
+                    m_match_cache[slot].first = -1;
+            }
+        Token next_token();
+    private:
+        Token get_basic_latin_word();
+        Token get_cjk_word(int);
+        std::vector<Chunk> create_chunks();
+        int next_word();
+        int next_char();
+        std::vector<Word *> find_match_words();
+        /* Upper bound on the number of characters tried per lookup. */
+        int max_word_length() { return 4; }
+        const char *m_text;     /* text being segmented */
+        int m_pos;              /* current byte offset into m_text */
+        int m_text_length;      /* byte length of m_text */
+        /* Tmp words stand in for one-character words that do not
+         * exist in the dictionary. Their length value is set to -1
+         * to mark them as tmp words. Slots are handed out from a
+         * fixed array and reused once the end has been reached. */
+        Word *get_tmp_word()
+        {
+            if (m_tmp_words_i >= max_tmp_words)
+                m_tmp_words_i = 0;  // wrap around
+            Word *slot = m_tmp_words + m_tmp_words_i;
+            ++m_tmp_words_i;
+            return slot;
+        }
+        /* related to max_word_length and match_cache_size */
+        static const int max_tmp_words = 64;
+        Word m_tmp_words[max_tmp_words];
+        int m_tmp_words_i;
+        /* Cache of recent find_match_words() results: each slot pairs
+         * a text position with the words found there. */
+        static const int match_cache_size = 3;
+        typedef std::pair<int, std::vector<Word *> > match_cache_t;
+        match_cache_t m_match_cache[match_cache_size];
+        int m_match_cache_i;
+    };
+}
+#endif /* _ALGORITHM_H_ */

data/ext/rmmseg/chunk.h ADDED Viewed

@@ -0,0 +1,58 @@
+#ifndef _CHUNK_H_
+#define _CHUNK_H_
+#include <cmath>
+#include "word.h"
+namespace rmmseg
+{
+    /**
+     * A chunk stores up to 3 successive words (n tells how many of the
+     * words[] slots are in use). A word whose length is -1 is a tmp
+     * word and counts as one character in the length statistics.
+     */
+    struct Chunk
+    {
+        /* Sum of the character lengths of the words in use. */
+        int total_length()
+        {
+            int sum = 0;
+            for (int k = 0; k < n; ++k)
+                sum += (words[k]->length == -1) /* tmp word */
+                    ? 1
+                    : words[k]->length;
+            return sum;
+        }
+        /* total_length() divided by the number of words. */
+        double average_length()
+        {
+            return static_cast<double>(total_length()) / n;
+        }
+        /* Square root of the summed squared deviations of each word
+         * length from the average length (the sum is not divided by n). */
+        double variance()
+        {
+            const double mean = average_length();
+            double acc = 0;
+            for (int k = 0; k < n; ++k)
+            {
+                double dev = words[k]->length;
+                if (dev == -1)
+                    dev = 1;
+                dev = dev - mean;
+                acc += dev * dev;
+            }
+            return std::sqrt(acc);
+        }
+        /* Sum of the freq field over the words in use. */
+        int degree_of_morphemic_freedom()
+        {
+            int total = 0;
+            int k = 0;
+            while (k < n)
+            {
+                total += words[k]->freq;
+                ++k;
+            }
+            return total;
+        }
+        int n;
+        Word *words[3];
+    };
+}
+#endif /* _CHUNK_H_ */

data/ext/rmmseg/dict.cpp ADDED Viewed

@@ -0,0 +1,228 @@
+#include <cstdio>
+#include "dict.h"
+using namespace std;
+namespace rmmseg
+{
+    struct Entry    // one node of a bin's singly linked chain
+    {
+        Word *word;     // the word stored in this node
+        Entry *next;    // next node in the same bin, or NULL at the end
+    };
+    const int init_size = 262147;   // initial number of bins (2^18 + 3)
+    const int max_density = 5;      // n_entries/n_bins ratio above which add() may call rehash()
+    /*
+      Table of numbers 2^n+a (prime numbers, according to the original
+      author), here for 19<=n<=30. new_size() picks the next bin count
+      from this table.
+    */
+    static int primes[] = {
+        524288 + 21,
+        1048576 + 7,
+        2097152 + 17,
+        4194304 + 15,
+        8388608 + 9,
+        16777216 + 43,
+        33554432 + 35,
+        67108864 + 15,
+        134217728 + 29,
+        268435456 + 3,
+        536870912 + 11,
+        1073741824 + 85,
+    };
+    static int n_bins = init_size;  // current number of bins
+    static int n_entries = 0;       // entry counter, incremented by add()
+    static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
+                                                            sizeof(Entry *)));  // zero-filled bin array
+    /* Returns the first entry of primes[] that is larger than the
+     * current n_bins, or n_bins itself when the table is exhausted. */
+    static int new_size()
+    {
+        const int *const table_end = primes + sizeof(primes)/sizeof(primes[0]);
+        for (const int *candidate = primes; candidate != table_end; ++candidate)
+            if (*candidate > n_bins)
+                return *candidate;
+        // TODO: raise exception here
+        return n_bins;
+    }
+    /* Hashes the len bytes starting at str (no nul terminator needed):
+     * every byte is mixed in with an add/shift/xor step, followed by a
+     * final scrambling of the accumulated key. */
+    static unsigned int hash(const char *str, int len)
+    {
+        unsigned int key = 0;
+        for (const char *p = str; len != 0; --len, ++p)
+        {
+            key += *p;
+            key += (key << 10);
+            key ^= (key >> 6);
+        }
+        key += (key << 3);
+        key ^= (key >> 11);
+        key += (key << 15);
+        return key;
+    }
+    /* Grows the bin array to the next size from new_size() and relinks
+     * every existing entry into it. The table is left untouched when no
+     * larger size is available or when the new array cannot be
+     * allocated. */
+    static void rehash()
+    {
+        const int new_n_bins = new_size();
+        if (new_n_bins <= n_bins)
+            return;     // size table exhausted: nothing to gain
+        Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
+                                                        sizeof(Entry *)));
+        if (!new_bins)
+            return;     // out of memory: keep using the current table
+        Entry *entry, *next;
+        unsigned int hash_val;
+        for (int i = 0; i < n_bins; ++i)
+        {
+            entry = bins[i];
+            while (entry)
+            {
+                // Remember the successor before the node is relinked
+                next = entry->next;
+                hash_val = hash(entry->word->text,
+                                entry->word->nbytes) % new_n_bins;
+                entry->next = new_bins[hash_val];
+                new_bins[hash_val] = entry;
+                entry = next;
+            }
+        }
+        free(bins);
+        n_bins = new_n_bins;
+        bins = new_bins;
+    }
+    namespace dict
+    {
+        /**
+         * Look up a word by its bytes.
+         *
+         * str: the base of the string
+         * len: length of the string (in bytes)
+         *
+         * str may point into the middle of a larger text and therefore
+         * need not be nul-terminated, which is why len is required.
+         *
+         * Returns the stored word, or NULL when there is no match.
+         */
+        Word *get(const char *str, int len)
+        {
+            const unsigned int slot = hash(str, len) % n_bins;
+            for (Entry *node = bins[slot]; node; node = node->next)
+            {
+                Word *candidate = node->word;
+                if (len == candidate->nbytes &&
+                    strncmp(str, candidate->text, len) == 0)
+                    return candidate;
+            }
+            return NULL;
+        }
+        /**
+         * Insert word into the dictionary, keyed by its text.
+         *
+         * When a word with the same bytes is already present, the stored
+         * pointer is replaced by the new one. Otherwise a new entry is
+         * linked at the head of its bin; the table is grown first when
+         * the entries-per-bin ratio exceeds max_density.
+         */
+        void add(Word *word)
+        {
+            const unsigned int hash_val = hash(word->text, word->nbytes);
+            unsigned int h = hash_val % n_bins;
+            for (Entry *cur = bins[h]; cur; cur = cur->next)
+            {
+                if (word->nbytes == cur->word->nbytes &&
+                    strncmp(word->text, cur->word->text, word->nbytes) == 0)
+                {
+                    /* Overwriting. WARNING: the original Word object is
+                     * permanently lost. This IS a memory leak, because
+                     * the memory is allocated by pool_alloc. Instead of
+                     * fixing this, tuning the dictionary file is a better
+                     * idea
+                     */
+                    cur->word = word;
+                    return;
+                }
+            }
+            /* Grow before linking, so the new entry is put directly into
+             * the bin it belongs to in the resized table and an existing
+             * chain in that bin is never overwritten. */
+            if (n_entries/n_bins > max_density)
+            {
+                rehash();
+                h = hash_val % n_bins;
+            }
+            Entry *fresh = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
+            if (!fresh)
+                return;     // allocation failed: the word is not stored
+            fresh->word = word;
+            fresh->next = bins[h];
+            bins[h] = fresh;
+            n_entries++;    // counts every entry, not only the first per bin
+        }
+        /**
+         * Strip the line ending from a line just read by fgets().
+         *
+         * Returns false when the line did not fit into the buffer; the
+         * rest of that line is then consumed from fp so that the next
+         * read starts at a fresh line. A final line without a newline
+         * is accepted as complete.
+         */
+        static bool chomp_line(char *buf, FILE *fp)
+        {
+            int n = static_cast<int>(strlen(buf));
+            const bool has_newline = (n > 0 && buf[n-1] == '\n');
+            while (n > 0 && (buf[n-1] == '\n' || buf[n-1] == '\r'))
+                buf[--n] = '\0';
+            if (has_newline)
+                return true;
+            /* No newline seen: either the file ends here, or the line
+             * ends right after the buffer, or the line is too long. */
+            int c = fgetc(fp);
+            if (c == EOF || c == '\n')
+                return true;
+            while (c != EOF && c != '\n')
+                c = fgetc(fp);
+            return false;
+        }
+        /**
+         * Load a character file: each line is "<frequency> <character>".
+         * Every entry is added as a word of length 1 with that frequency.
+         * Lines without a space, with empty text or longer than the line
+         * buffer are skipped.
+         *
+         * Returns false when the file cannot be opened, true otherwise.
+         */
+        bool load_chars(const char *filename)
+        {
+            FILE *fp = fopen(filename, "r");
+            if (!fp)
+            {
+                return false;
+            }
+            const int buf_len = 24;
+            char buf[buf_len];
+            char *ptr;
+            while(fgets(buf, buf_len, fp))
+            {
+                if (!chomp_line(buf, fp))
+                    continue;       // over-long line
+                ptr = strchr(buf, ' ');
+                if (!ptr || ptr[1] == '\0')
+                    continue;       // illegal input
+                *ptr = '\0';
+                add(make_word(ptr+1, 1, atoi(buf)));
+            }
+            fclose(fp);
+            return true;
+        }
+        /**
+         * Load a word file: each line is "<character count> <word>".
+         * Every entry is added as a word of that length with frequency 0.
+         * Lines without a space, with empty text or longer than the line
+         * buffer are skipped.
+         *
+         * Returns false when the file cannot be opened, true otherwise.
+         */
+        bool load_words(const char *filename)
+        {
+            FILE *fp = fopen(filename, "r");
+            if (!fp)
+            {
+                return false;
+            }
+            const int buf_len = 48;
+            char buf[buf_len];
+            char *ptr;
+            while(fgets(buf, buf_len, fp))
+            {
+                if (!chomp_line(buf, fp))
+                    continue;       // over-long line
+                ptr = strchr(buf, ' ');
+                if (!ptr || ptr[1] == '\0')
+                    continue;       // illegal input
+                *ptr = '\0';
+                add(make_word(ptr+1, atoi(buf), 0));
+            }
+            fclose(fp);
+            return true;
+        }
+    }
+}

data/ext/rmmseg/dict.h ADDED Viewed

@@ -0,0 +1,34 @@
+#ifndef _DICT_H_
+#define _DICT_H_
+#include "word.h"
+/**
+ * A dictionary is a hash table of
+ *  - key: string
+ *  - value: word
+ *
+ * Dictionary data can be loaded from files. Two types of dictionary
+ * files are supported:
+ *  - character file: Each line contains a number and a character,
+ *                    the number is the frequency of the character.
+ *                    The frequency should NOT exceed 65535.
+ *  - word file:      Each line contains a number and a word, the
+ *                    number is the character count of the word.
+ */
+namespace rmmseg
+{
+    /* There is only ever one dictionary, so it is exposed as a set of
+     * free functions in a namespace rather than as a class. */
+    namespace dict
+    {
+        /* Insert a word, keyed by its text. */
+        void  add(Word *word);
+        /* Load a character file; false when it cannot be opened. */
+        bool  load_chars(const char *filename);
+        /* Load a word file; false when it cannot be opened. */
+        bool  load_words(const char *filename);
+        /* Find the word made of the len bytes at str, or NULL. */
+        Word *get(const char *str, int len);
+    }
+}
+#endif /* _DICT_H_ */

data/ext/rmmseg/extconf.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'mkmf'
+# Link the shared object through g++ (presumably so that the C++
+# runtime is linked in -- confirm against the target platforms).
+CONFIG['LDSHARED'] = 'g++ -shared'
+# Object files the extension is built from.
+$objs = %w[algor.o dict.o memory.o rmmseg.o]
+create_makefile('rmmseg')

data/ext/rmmseg/memory.cpp ADDED Viewed

@@ -0,0 +1,9 @@
+#include "memory.h"
+/* Size of the block reserved when the program starts. */
+#define PRE_ALLOC_SIZE (2 * 1024 * 1024) /* 2MB */
+namespace rmmseg
+{
+    /* Bytes still available in the current pool block. */
+    int   _pool_size = PRE_ALLOC_SIZE;
+    /* Next free byte of the current pool block. */
+    char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
+}

data/ext/rmmseg/memory.h ADDED Viewed

@@ -0,0 +1,43 @@
+#ifndef _MEMORY_H_
+#define _MEMORY_H_
+#include <cstdlib>
+/**
+ * Pre-allocate a chunk of memory and hand it out in small pieces.
+ * This memory is never freed after allocation. It is used for persistent
+ * data, such as dictionary contents, that is never destroyed until
+ * the application exits.
+ */
+namespace rmmseg
+{
+    const int REALLOC_SIZE = 2048; /* 2KB */
+    extern int   _pool_size;
+    extern char *_pool_base;
+    /**
+     * Carve len bytes out of the pool.
+     *
+     * The bytes come from the current block when it has enough room
+     * left; otherwise a new block is obtained with malloc and becomes
+     * the current block. Memory handed out is never freed.
+     *
+     * Returns NULL when len is negative or when no memory is available.
+     */
+    inline void *pool_alloc(int len)
+    {
+        if (len < 0)
+            return NULL;
+        if (_pool_base && len <= _pool_size)
+        {
+            void *mem = _pool_base;
+            _pool_size -= len;
+            _pool_base += len;
+            return mem;
+        }
+        /* NOTE: the remaining memory is simply discarded, which WILL
+         * cause a memory leak. However, this function is not meant for
+         * allocating large objects, and a larger pre-allocated chunk
+         * reduces the impact of this leak. So this is generally not a
+         * problem. */
+        /* A request bigger than REALLOC_SIZE gets a block of its own
+         * size, so the caller never receives fewer bytes than asked. */
+        const int block_size = (len > REALLOC_SIZE) ? len : REALLOC_SIZE;
+        char *block = static_cast<char *>(std::malloc(block_size));
+        if (!block)
+            return NULL;    /* the current pool state stays as it was */
+        _pool_base = block + len;
+        _pool_size = block_size - len;
+        return block;
+    }
+}
+#endif /* _MEMORY_H_ */