RubyGems - rmmseg-cpp-new - Versions diffs - 0.3.0 - Mend

rmmseg-cpp-new 0.3.0

Files changed (22) hide show

data/ext/rmmseg/algor.cpp ADDED Viewed

@@ -0,0 +1,253 @@
+#include <cctype>
+#include <cassert>
+#include "rules.h"
+#include "algor.h"
+using namespace std;
+namespace rmmseg
+{
+    /**
+     * Produce the next non-empty token starting at the current position.
+     *
+     * The byte length reported by next_char() selects the tokenizer:
+     * exactly 1 goes to get_basic_latin_word(), anything else goes to
+     * get_cjk_word(). Empty tokens returned by either tokenizer are
+     * skipped. Token(NULL, 0) is returned once the end of the text has
+     * been reached.
+     */
+    Token Algorithm::next_token()
+    {
+        while (m_pos < m_text_length)
+        {
+            const int char_bytes = next_char();
+            Token candidate = (char_bytes == 1)
+                ? get_basic_latin_word()
+                : get_cjk_word(char_bytes);
+            if (candidate.length > 0)
+                return candidate;
+        }
+        return Token(NULL, 0);
+    }
+    /**
+     * Extract one run of ASCII alphanumeric bytes.
+     *
+     * Starting at m_pos, non-alphanumeric single bytes are skipped, then
+     * the alphanumeric run is collected, then trailing non-alphanumeric
+     * single bytes are skipped as well. Scanning stops as soon as a
+     * multi-byte lead (next_char() > 1) or the end of the text is met.
+     *
+     * Returns a Token pointing at the run inside m_text; its length is 0
+     * when only punctuation/whitespace was consumed.
+     *
+     * Compared with the previous version, next_char() is only evaluated
+     * while m_pos < m_text_length (no read past the end of the text), and
+     * isalnum() is only given bytes that next_char() classified as
+     * single-byte ASCII, converted to unsigned char, so it never receives
+     * a negative value.
+     */
+    Token Algorithm::get_basic_latin_word()
+    {
+        // Skip pre-word whitespaces and punctuations
+        while (m_pos < m_text_length)
+        {
+            const int len = next_char();
+            if (len > 1)
+                break;
+            if (len == 1 && isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;    // also steps over bytes next_char() reports as -1
+        }
+        const int start = m_pos;
+        // Collect the word itself: ASCII letters and digits only
+        while (m_pos < m_text_length)
+        {
+            if (next_char() != 1)
+                break;
+            if (!isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;
+        }
+        const int end = m_pos;
+        // Skip post-word whitespaces and punctuations
+        while (m_pos < m_text_length)
+        {
+            const int len = next_char();
+            if (len > 1)
+                break;
+            if (len == 1 && isalnum(static_cast<unsigned char>(m_text[m_pos])))
+                break;
+            m_pos++;
+        }
+        return Token(m_text+start, end-start);
+    }
+    /**
+     * Segment one word at m_pos using the chunk filters.
+     *
+     * All candidate chunks from create_chunks() are narrowed down by
+     * mm_filter, lawl_filter, svwl_filter and lsdmfocw_filter, each one
+     * only applied while more than one candidate is left. The first word
+     * of the first surviving chunk becomes the token and m_pos is moved
+     * past it.
+     *
+     * len is the byte length next_char() reported for the character at
+     * m_pos; it is only used to step over input for which no chunk could
+     * be built.
+     *
+     * The cursor always moves forward by at least one byte and never past
+     * m_text_length. Previously an empty chunk list left m_pos untouched,
+     * so next_token() looped forever, and a word with nbytes <= 0 moved
+     * the cursor backwards.
+     *
+     * Returns the token, or Token(NULL, 0) when the bytes at m_pos were
+     * skipped because no chunk was available.
+     */
+    Token Algorithm::get_cjk_word(int len)
+    {
+        vector<Chunk> chunks = create_chunks();
+        if (chunks.size() > 1)
+            mm_filter(chunks);
+        if (chunks.size() > 1)
+            lawl_filter(chunks);
+        if (chunks.size() > 1)
+            svwl_filter(chunks);
+        if (chunks.size() > 1)
+            lsdmfocw_filter(chunks);
+
+        const bool have_chunk = !chunks.empty();
+        const int remaining = m_text_length - m_pos;
+        int nbytes = have_chunk ? chunks[0].words[0]->nbytes : len;
+        if (nbytes < 1)
+            nbytes = 1;             // guarantee forward progress
+        if (nbytes > remaining)
+            nbytes = remaining;     // never step beyond the text
+
+        if (!have_chunk)
+        {
+            // Nothing could be segmented here: drop the bytes so that the
+            // caller's loop continues with the following character.
+            m_pos += nbytes;
+            return Token(NULL, 0);
+        }
+        Token token(m_text+m_pos, nbytes);
+        m_pos += nbytes;
+        return token;
+    }
+    /**
+     * Enumerate the candidate chunks that start at m_pos.
+     *
+     * Every word returned by find_match_words() at m_pos is combined with
+     * every word found right after it, and again with every word found
+     * after that pair, giving chunks of up to three words. A chunk is
+     * shorter than three words when the text ends after the first or the
+     * second word, or when the third word is a tmp word (length == -1),
+     * in which case only the first two words are kept.
+     *
+     * m_pos is moved temporarily while scanning and is restored before
+     * returning.
+     */
+    vector<Chunk> Algorithm::create_chunks()
+    {
+        typedef vector<Word *> word_list;
+        typedef word_list::iterator word_iter;
+
+        vector<Chunk> result;
+        Chunk candidate;    // reused; a copy is stored on every push_back
+        const int start_pos = m_pos;
+
+        word_list firsts = find_match_words();
+        for (word_iter a = firsts.begin(); a != firsts.end(); ++a)
+        {
+            Word *first = *a;
+            candidate.words[0] = first;
+            m_pos += first->nbytes;
+            if (m_pos == m_text_length)
+            {
+                // The first word reaches the end of the text.
+                candidate.n = 1;
+                result.push_back(candidate);
+            }
+            else if (m_pos < m_text_length)
+            {
+                word_list seconds = find_match_words();
+                for (word_iter b = seconds.begin(); b != seconds.end(); ++b)
+                {
+                    Word *second = *b;
+                    candidate.words[1] = second;
+                    m_pos += second->nbytes;
+                    if (m_pos == m_text_length)
+                    {
+                        // The second word reaches the end of the text.
+                        candidate.n = 2;
+                        result.push_back(candidate);
+                    }
+                    else if (m_pos < m_text_length)
+                    {
+                        word_list thirds = find_match_words();
+                        for (word_iter c = thirds.begin(); c != thirds.end(); ++c)
+                        {
+                            Word *third = *c;
+                            const bool is_tmp = (third->length == -1);
+                            if (!is_tmp)
+                                candidate.words[2] = third;
+                            candidate.n = is_tmp ? 2 : 3;
+                            result.push_back(candidate);
+                        }
+                    }
+                    m_pos -= second->nbytes;
+                }
+            }
+            m_pos -= first->nbytes;
+        }
+
+        m_pos = start_pos;
+        return result;
+    }
+    /**
+     * Byte length of the UTF-8 sequence whose lead byte is at m_pos.
+     *
+     * Only the lead byte is inspected:
+     *   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
+     * Any other bit pattern yields -1. ONLY for UTF-8.
+     */
+    int Algorithm::next_char()
+    {
+        const unsigned char lead = m_text[m_pos];
+        if ((lead & 0x80) == 0x00)
+            return 1;
+        if ((lead & 0xE0) == 0xC0)
+            return 2;
+        if ((lead & 0xF0) == 0xE0)
+            return 3;
+        if ((lead & 0xF8) == 0xF0)
+            return 4;
+        return -1;
+    }
+    // int Algorithm::next_char(const char * p)
+    // {
+    //     uint8 one = (unsigned char)(*p);
+    //     if (  one>>7 == 0      )
+    //             return 1;
+    //     if ( one>>5  ==  0x6 )
+    //             return 2;
+    //     else if (one >> 4 == 0xe)
+    //             return 3;
+    //     else if (one >> 3 == 0x1e )
+    //             return 4;
+    //     else
+    //             return -1;
+    // }
+    /**
+     * Collect every dictionary word that starts at m_pos.
+     *
+     * Results are memoized per start offset in the small ring buffer
+     * m_match_cache. Otherwise up to max_word_length() multi-byte
+     * characters are scanned and each prefix is looked up with
+     * dict::get(). When nothing matches, a single tmp word (length == -1)
+     * holding the character at m_pos is returned instead, so the result
+     * is never empty. m_pos is left unchanged.
+     *
+     * Malformed input is handled defensively: a lead byte for which
+     * next_char() returns -1 is treated as a 1-byte character, and a
+     * sequence that would run past m_text_length is cut at the end of the
+     * text. Previously -1 reached strncpy() as a huge size, and truncated
+     * sequences were read beyond the end of the text.
+     *
+     * NOTE(review): callers visible in this file only call this while
+     * m_pos < m_text_length; that precondition is assumed here.
+     */
+    vector<Word *> Algorithm::find_match_words()
+    {
+        for (int i = 0; i < match_cache_size; ++i)
+            if (m_match_cache[i].first == m_pos)
+                return m_match_cache[i].second;
+
+        vector<Word *> words;
+        const int orig_pos = m_pos;
+        int n = 0;
+        while (m_pos < m_text_length && n < max_word_length())
+        {
+            const int len = next_char();
+            if (len <= 1)
+                break;      // ASCII or invalid lead byte ends the run
+            if (len > m_text_length - m_pos)
+                break;      // truncated sequence at the end of the text
+            m_pos += len;
+            n++;
+            Word *word = dict::get(m_text+orig_pos, m_pos-orig_pos);
+            if (word)
+                words.push_back(word);
+        }
+        m_pos = orig_pos;
+
+        if (words.empty())
+        {
+            int nbytes = next_char();
+            if (nbytes < 1)
+                nbytes = 1;                         // invalid lead byte
+            if (nbytes > m_text_length - m_pos)
+                nbytes = m_text_length - m_pos;     // truncated sequence
+            Word *word = get_tmp_word();
+            word->nbytes = nbytes;
+            word->length = -1;
+            strncpy(word->text, m_text+m_pos, word->nbytes);
+            word->text[word->nbytes] = '\0';
+            words.push_back(word);
+        }
+
+        m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
+        m_match_cache_i++;
+        if (m_match_cache_i >= match_cache_size)
+            m_match_cache_i = 0;
+        return words;
+    }
+}

data/ext/rmmseg/algor.h ADDED Viewed

@@ -0,0 +1,79 @@
+#ifndef _ALGORITHM_H_
+#define _ALGORITHM_H_
+#include <vector>
+#include "chunk.h"
+#include "token.h"
+#include "dict.h"
+/**
+ * The MMSeg algorithm uses four rules:
+ *  - Maximum matching rule
+ *  - Largest average word length rule
+ *  - Smallest variance of word length rule
+ *  - Largest sum of degree of morphemic freedom of one-character
+ *    words rule
+ */
+namespace rmmseg
+{
+    /**
+     * Tokenizer over a caller-owned text buffer.
+     *
+     * The object stores the pointer and length it is given and a cursor
+     * (m_pos) that next_token() advances.
+     */
+    class Algorithm
+    {
+    public:
+        /**
+         * text:   base of the text to segment (the pointer is stored,
+         *         the text is not copied)
+         * length: length of the text in bytes
+         */
+        Algorithm(const char *text, int length)
+            : m_text(text),
+              m_pos(0),
+              m_text_length(length),
+              m_tmp_words_i(0),
+              m_match_cache_i(0)
+        {
+            // Offset -1 never equals m_pos, so it marks a cache slot as
+            // unused.
+            int slot = match_cache_size;
+            while (slot-- > 0)
+                m_match_cache[slot].first = -1;
+        }
+
+        /* Next token of the text; see the definition for details. */
+        Token next_token();
+
+        /* The text pointer handed to the constructor. */
+        const char *get_text() const
+        {
+            return m_text;
+        }
+
+    private:
+        Token get_basic_latin_word();
+        Token get_cjk_word(int);
+        std::vector<Chunk> create_chunks();
+        int next_word();
+        int next_char();
+        std::vector<Word *> find_match_words();
+
+        /* Upper bound passed to the scan in find_match_words(). */
+        int max_word_length() { return 20; }
+
+        const char *m_text;
+        int m_pos;
+        int m_text_length;
+
+        /* Tmp words are only used for 1-char words which do not
+         * exist in the dictionary. Their length value is set to -1
+         * to indicate a tmp word. Slots are handed out in a ring,
+         * so a slot is reused after max_tmp_words further calls. */
+        Word *get_tmp_word()
+        {
+            const int slot =
+                (m_tmp_words_i < max_tmp_words) ? m_tmp_words_i : 0;  // round wrap
+            m_tmp_words_i = slot + 1;
+            return &m_tmp_words[slot];
+        }
+
+        /* related to max_word_length and match_cache_size */
+        static const int max_tmp_words = 512;
+        Word m_tmp_words[max_tmp_words];
+        int m_tmp_words_i;
+
+        /* Match word cache: (start offset, words found there) pairs,
+         * overwritten in ring order. */
+        static const int match_cache_size = 3;
+        typedef std::pair<int, std::vector<Word *> > match_cache_t;
+        match_cache_t m_match_cache[match_cache_size];
+        int m_match_cache_i;
+    };
+}
+#endif /* _ALGORITHM_H_ */

data/ext/rmmseg/chunk.h ADDED Viewed

@@ -0,0 +1,59 @@
+#ifndef _CHUNK_H_
+#define _CHUNK_H_
+#include <cmath>
+#include "word.h"
+namespace rmmseg
+{
+    /**
+     * A chunk stores 3 (or less) successive words.
+     */
+    /**
+     * A chunk stores 3 (or less) successive words.
+     *
+     * Only the first n entries of words are meaningful. A word length
+     * of -1 (tmp word) counts as 1 in the length based measures.
+     */
+    struct Chunk
+    {
+        /* Sum of the absolute word lengths. */
+        int total_length() const
+        {
+            int sum = 0;
+            int idx = 0;
+            while (idx < n)
+            {
+                sum += std::abs(words[idx]->length);
+                ++idx;
+            }
+            return sum;
+        }
+
+        /* total_length() divided by the number of words. */
+        double average_length() const
+        {
+            const double total = (double)total_length();
+            return total/n;
+        }
+
+        /* Square root of the summed squared deviations of the word
+         * lengths from average_length(). The sum is not divided by n. */
+        double variance() const
+        {
+            const double mean = average_length();
+            double squares = 0;
+            int idx = 0;
+            while (idx < n)
+            {
+                double deviation = std::abs(words[idx]->length);
+                deviation = deviation-mean;
+                squares += deviation*deviation;
+                ++idx;
+            }
+            return std::sqrt(squares);
+        }
+
+        /* Sum of the freq fields of all words in the chunk. */
+        int degree_of_morphemic_freedom() const
+        {
+            int total = 0;
+            int idx = 0;
+            while (idx < n)
+            {
+                total += words[idx]->freq;
+                ++idx;
+            }
+            return total;
+        }
+
+        int n;              /* number of valid entries in words */
+        Word *words[3];
+    };
+}
+#endif /* _CHUNK_H_ */

data/ext/rmmseg/dict.cpp ADDED Viewed

@@ -0,0 +1,230 @@
+#include <cstdio>
+#include "dict.h"
+using namespace std;
+namespace rmmseg
+{
+    struct Entry            // One node of a bin's singly linked chain in the hash table.
+    {
+        Word *word;         // Word stored in this node; replaced in place by dict::add on a duplicate key.
+        Entry *next;        // Next node in the same bin, or NULL at the end of the chain.
+    };
+    const size_t init_size = 262147;    // Initial number of bins (2^18 + 3).
+    const size_t max_density = 5;       // dict::add compares n_entries/n_bins against this before rehashing.
+    /*
+      Table of prime numbers 2^n+a, 19<=n<=30 (the sizes new_size() picks from when the table grows).
+    */
+    static size_t primes[] = {
+        524288 + 21,
+        1048576 + 7,
+        2097152 + 17,
+        4194304 + 15,
+        8388608 + 9,
+        16777216 + 43,
+        33554432 + 35,
+        67108864 + 15,
+        134217728 + 29,
+        268435456 + 3,
+        536870912 + 11,
+        1073741824 + 85,
+    };
+    static size_t n_bins = init_size;   // Current number of bins in `bins`.
+    static size_t n_entries = 0;        // Number of Entry nodes inserted so far.
+    static Entry **bins = static_cast<Entry **>(std::calloc(init_size,   // Zero-filled bin array; result is not checked for NULL.
+                                                            sizeof(Entry *)));
+    /**
+     * Bin count to grow to: the first entry of primes that is larger
+     * than the current n_bins, or n_bins itself when the table holds no
+     * larger value.
+     */
+    static size_t new_size()
+    {
+        const size_t count = sizeof(primes)/sizeof(primes[0]);
+        size_t idx = 0;
+        while (idx < count && primes[idx] <= n_bins)
+            ++idx;
+        // TODO: raise exception here
+        return (idx < count) ? primes[idx] : n_bins;
+    }
+    /**
+     * Hash the first len bytes of str.
+     *
+     * Each byte is added to the accumulator and mixed with a shift-add
+     * and a shift-xor step; three more mixing steps finish the value.
+     * All arithmetic is unsigned int and wraps around.
+     */
+    static unsigned int hash(const char *str, int len)
+    {
+        unsigned int acc = 0;
+        for (; len != 0; --len, ++str)
+        {
+            acc += *str;
+            acc += (acc << 10);
+            acc ^= (acc >> 6);
+        }
+        // Final avalanche.
+        acc += (acc << 3);
+        acc ^= (acc >> 11);
+        acc += (acc << 15);
+        return acc;
+    }
+    /**
+     * Grow the bin array to new_size() bins and redistribute every
+     * entry. Entry nodes are relinked, not reallocated.
+     *
+     * The table is left untouched when new_size() offers no larger size
+     * or when the new bin array cannot be allocated; lookups and inserts
+     * keep working on the old, denser table. Previously a failed calloc
+     * was dereferenced.
+     */
+    static void rehash()
+    {
+        const size_t new_n_bins = new_size();
+        if (new_n_bins <= n_bins)
+            return;     // no larger prime available: nothing to gain
+        Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
+                                                        sizeof(Entry *)));
+        if (!new_bins)
+            return;     // out of memory: keep the current table
+        for (size_t i = 0; i < n_bins; ++i)
+        {
+            Entry *entry = bins[i];
+            while (entry)
+            {
+                // Remember the successor before the node is relinked.
+                Entry *next = entry->next;
+                const size_t slot = hash(entry->word->text,
+                                         entry->word->nbytes) % new_n_bins;
+                entry->next = new_bins[slot];
+                new_bins[slot] = entry;
+                entry = next;
+            }
+        }
+        free(bins);
+        n_bins = new_n_bins;
+        bins = new_bins;
+    }
+    namespace dict
+    {
+        /**
+         * Look up a word by its bytes.
+         *
+         * str: the base of the string
+         * len: length of the string (in bytes)
+         *
+         * str may point into the middle of a larger text and need not be
+         * nul-terminated, which is why len is required.
+         *
+         * Returns the stored Word whose nbytes equals len and whose text
+         * matches, or NULL when there is none.
+         */
+        Word *get(const char *str, int len)
+        {
+            const unsigned int slot = hash(str, len) % n_bins;
+            for (Entry *node = bins[slot]; node != NULL; node = node->next)
+            {
+                Word *candidate = node->word;
+                if (candidate->nbytes != len)
+                    continue;
+                if (strncmp(str, candidate->text, len) == 0)
+                    return candidate;
+            }
+            return NULL;
+        }
+        /**
+         * Insert word into the dictionary, keyed by its text bytes.
+         *
+         * If a word with the same bytes is already stored, the entry is
+         * redirected to the new Word.
+         * WARNING: the original Word object is then permanently lost.
+         * This IS a memory leak, because the memory is allocated by
+         * pool_alloc. Instead of fixing this, tuning the dictionary file
+         * is a better idea.
+         *
+         * Otherwise a new Entry is pushed at the head of the bin's chain.
+         * The table is grown first when n_entries/n_bins exceeds
+         * max_density.
+         *
+         * The new entry is always linked in front of whatever the bin
+         * holds after a rehash. Previously the post-rehash bin was
+         * overwritten with a chain of one, dropping the entries that had
+         * just been moved into it. The density check now also runs for
+         * every new key, not only when the target bin is empty.
+         */
+        void add(Word *word)
+        {
+            const unsigned int hash_val = hash(word->text, word->nbytes);
+            unsigned int h = hash_val % n_bins;
+
+            for (Entry *entry = bins[h]; entry; entry = entry->next)
+            {
+                if (word->nbytes == entry->word->nbytes &&
+                    strncmp(word->text, entry->word->text, word->nbytes) == 0)
+                {
+                    /* Overwriting; see the WARNING above. */
+                    entry->word = word;
+                    return;
+                }
+            }
+
+            if (n_entries/n_bins > max_density)
+            {
+                rehash();
+                h = hash_val % n_bins;  // the bin count may have changed
+            }
+            Entry *entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
+            entry->word = word;
+            entry->next = bins[h];      // keep the chain already in this bin
+            bins[h] = entry;
+            n_entries++;
+        }
+        /**
+         * Load a character dictionary file.
+         *
+         * Each line is split at its first space: the part before it is
+         * parsed with atoi() and passed to make_word() as the third
+         * argument, the part after it is the word text, and the second
+         * argument is always 1. Lines without a space are ignored.
+         *
+         * Trailing '\n' and '\r' are stripped only when present, so a
+         * last line without a newline keeps its final character
+         * (previously the last byte of the line was always cut off).
+         * Lines that do not fit into the buffer are skipped as illegal
+         * input.
+         *
+         * Returns false when the file cannot be opened, true otherwise.
+         */
+        bool load_chars(const char *filename)
+        {
+            FILE *fp = fopen(filename, "r");
+            if (!fp)
+            {
+                return false;
+            }
+            const size_t buf_len = 24;
+            char buf[buf_len];
+            while(fgets(buf, buf_len, fp))
+            {
+                size_t n = strlen(buf);
+                if (n == 0 || buf[n-1] != '\n')
+                {
+                    // No newline was read: either this is the last line
+                    // of the file or the line did not fit into buf.
+                    int c = fgetc(fp);
+                    if (c != EOF && c != '\n')
+                    {
+                        // Overlong line: discard the rest of it.
+                        while (c != EOF && c != '\n')
+                            c = fgetc(fp);
+                        continue;
+                    }
+                }
+                while (n > 0 && (buf[n-1] == '\n' || buf[n-1] == '\r'))
+                    buf[--n] = '\0';    // strip the line terminator
+                char *ptr = strchr(buf, ' ');
+                if (!ptr)
+                    continue;       // illegal input
+                *ptr = '\0';
+                add(make_word(ptr+1, 1, atoi(buf)));
+            }
+            fclose(fp);
+            return true;
+        }
+        /**
+         * Load a word dictionary file.
+         *
+         * Each line is split at its first space: the part before it is
+         * parsed with atoi() and passed to make_word() as the second
+         * argument, the part after it is the word text, and the third
+         * argument is always 0. Lines without a space are ignored.
+         *
+         * Trailing '\n' and '\r' are stripped only when present, so a
+         * last line without a newline keeps its final character
+         * (previously the last byte of the line was always cut off).
+         * Lines that do not fit into the buffer are skipped as illegal
+         * input.
+         *
+         * Returns false when the file cannot be opened, true otherwise.
+         */
+        bool load_words(const char *filename)
+        {
+            FILE *fp = fopen(filename, "r");
+            if (!fp)
+            {
+                return false;
+            }
+            const int buf_len = 48;
+            char buf[buf_len];
+            while(fgets(buf, buf_len, fp))
+            {
+                size_t n = strlen(buf);
+                if (n == 0 || buf[n-1] != '\n')
+                {
+                    // No newline was read: either this is the last line
+                    // of the file or the line did not fit into buf.
+                    int c = fgetc(fp);
+                    if (c != EOF && c != '\n')
+                    {
+                        // Overlong line: discard the rest of it.
+                        while (c != EOF && c != '\n')
+                            c = fgetc(fp);
+                        continue;
+                    }
+                }
+                while (n > 0 && (buf[n-1] == '\n' || buf[n-1] == '\r'))
+                    buf[--n] = '\0';    // strip the line terminator
+                char *ptr = strchr(buf, ' ');
+                if (!ptr)
+                    continue;       // illegal input
+                *ptr = '\0';
+                add(make_word(ptr+1, atoi(buf), 0));
+            }
+            fclose(fp);
+            return true;
+        }
+    }
+}