rmmseg-cpp-traditional 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/History.txt +21 -0
- data/LICENSE.txt +22 -0
- data/Manifest.txt +43 -0
- data/README +111 -0
- data/README.md +29 -0
- data/Rakefile +19 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +222 -0
- data/ext/rmmseg/algor.h +80 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +17 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- data/lib/rmmseg-cpp-traditional/version.rb +7 -0
- data/lib/rmmseg-cpp-traditional.rb +9 -0
- data/lib/rmmseg.rb +3 -0
- data/misc/convert.rb +114 -0
- data/misc/ferret_example.rb +59 -0
- data/misc/homepage.erb +196 -0
- data/misc/homepage.html +1212 -0
- data/rmmseg-cpp-traditional.gemspec +19 -0
- data/spec/rmmseg_spec.rb +8 -0
- data/spec/spec_helper.rb +17 -0
- data/tasks/ann.rake +81 -0
- data/tasks/bones.rake +21 -0
- data/tasks/gem.rake +126 -0
- data/tasks/git.rake +41 -0
- data/tasks/homepage.rake +15 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/notes.rake +28 -0
- data/tasks/post_load.rake +39 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +58 -0
- data/tasks/setup.rb +268 -0
- data/tasks/spec.rake +55 -0
- data/tasks/svn.rake +48 -0
- data/tasks/test.rake +38 -0
- data/test/test_rmmseg.rb +0 -0
- metadata +116 -0
| @@ -0,0 +1,222 @@ | |
| 1 | 
            +
            #include <cctype>
         | 
| 2 | 
            +
            #include <cassert>
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            #include "rules.h"
         | 
| 5 | 
            +
            #include "algor.h"
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            using namespace std;
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            namespace rmmseg
         | 
| 10 | 
            +
            {
         | 
| 11 | 
            +
                /**
                 * Return the next non-empty token of the text.
                 *
                 * The byte length reported by next_char() selects the scanner:
                 * a length of 1 goes to get_basic_latin_word(), any other
                 * length goes to get_cjk_word(). Zero-length tokens are
                 * dropped and scanning continues from the updated m_pos.
                 *
                 * @return the next token whose length is > 0, or Token(NULL, 0)
                 *         once m_pos has reached m_text_length.
                 */
                Token Algorithm::next_token()
                {
                    for (;;)
                    {
                        // End of input: signal it with an empty token.
                        if (m_pos >= m_text_length)
                            return Token(NULL, 0);

                        const int char_len = next_char();
                        Token candidate = (char_len == 1)
                            ? get_basic_latin_word()
                            : get_cjk_word(char_len);

                        if (candidate.length > 0)
                            return candidate;
                    }
                }
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                /**
                 * Scan one run of single-byte alphanumeric characters.
                 *
                 * Starting at m_pos, the function
                 *   1. skips single-byte characters that are not alphanumeric,
                 *   2. consumes the following run of alphanumeric bytes (the word),
                 *   3. skips trailing single-byte non-alphanumeric characters.
                 * Every phase stops at the end of the text or as soon as
                 * next_char() reports a multi-byte character.
                 *
                 * Bytes are converted to unsigned char before being handed to
                 * isalnum(), because passing a negative char value is undefined.
                 * next_char() is only consulted while m_pos is still inside the
                 * text, so no byte past m_text_length is examined.
                 *
                 * @return a token covering the word found in phase 2; its length
                 *         is 0 when no alphanumeric byte was found.
                 */
                Token Algorithm::get_basic_latin_word()
                {
                    int len = 1;

                    // Skip pre-word whitespace and punctuation.
                    while (m_pos < m_text_length
                           && len <= 1
                           && !isalnum(static_cast<unsigned char>(m_text[m_pos])))
                    {
                        m_pos++;
                        if (m_pos < m_text_length)
                            len = next_char();
                    }

                    // Consume the word itself.
                    const int start = m_pos;
                    while (m_pos < m_text_length
                           && len <= 1
                           && isalnum(static_cast<unsigned char>(m_text[m_pos])))
                    {
                        m_pos++;
                        if (m_pos < m_text_length)
                            len = next_char();
                    }
                    const int end = m_pos;

                    // Skip post-word whitespace and punctuation.
                    while (m_pos < m_text_length
                           && len <= 1
                           && !isalnum(static_cast<unsigned char>(m_text[m_pos])))
                    {
                        m_pos++;
                        if (m_pos < m_text_length)
                            len = next_char();
                    }

                    return Token(m_text+start, end-start);
                }
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                /**
                 * Segment one word at m_pos out of a run of multi-byte characters.
                 *
                 * All candidate chunks are generated with create_chunks() and
                 * then narrowed by mm_filter, lawl_filter, svwl_filter and
                 * lsdmfocw_filter, in that order. A filter is applied only while
                 * more than one candidate remains. The first word of the first
                 * surviving chunk becomes the token and m_pos advances past it.
                 *
                 * @param len byte length of the character at m_pos as seen by the
                 *            caller; the body does not use it.
                 * @return the selected word, or Token(NULL, 0) when no chunk
                 *         could be built.
                 */
                Token Algorithm::get_cjk_word(int len)
                {
                    vector<Chunk> candidates = create_chunks();

                    if (candidates.size() > 1)
                        mm_filter(candidates);
                    if (candidates.size() > 1)
                        lawl_filter(candidates);
                    if (candidates.size() > 1)
                        svwl_filter(candidates);
                    if (candidates.size() > 1)
                        lsdmfocw_filter(candidates);

                    if (candidates.empty())
                        return Token(NULL, 0);

                    const Word *winner = candidates[0].words[0];
                    Token token(m_text+m_pos, winner->nbytes);
                    m_pos += winner->nbytes;
                    return token;
                }
         | 
| 92 | 
            +
                
         | 
| 93 | 
            +
                /**
                 * Enumerate every chunk of up to three consecutive words that
                 * starts at m_pos.
                 *
                 * For each word matching at m_pos the cursor is moved past it and
                 * the matches at the new position are explored, up to three
                 * levels deep. A chunk is recorded
                 *   - with n == 1 or n == 2 when the text ends exactly after the
                 *     first or second word,
                 *   - with n == 2 when the third candidate is a tmp word
                 *     (length == -1),
                 *   - with n == 3 otherwise.
                 * m_pos is moved temporarily during the search and is restored
                 * before returning.
                 *
                 * @return all candidate chunks, in discovery order.
                 */
                vector<Chunk> Algorithm::create_chunks()
                {
                    typedef vector<Word *> word_list;
                    typedef word_list::iterator word_iter;

                    vector<Chunk> result;
                    // One chunk object is reused for every candidate; slots at
                    // index >= n keep whatever an earlier iteration stored.
                    Chunk current;
                    const int saved_pos = m_pos;

                    word_list first_words = find_match_words();
                    for (word_iter a = first_words.begin(); a != first_words.end(); ++a)
                    {
                        Word *first = *a;
                        current.words[0] = first;
                        m_pos += first->nbytes;

                        if (m_pos == m_text_length)
                        {
                            // The first word reaches the end of the text.
                            current.n = 1;
                            result.push_back(current);
                        }
                        else if (m_pos < m_text_length)
                        {
                            word_list second_words = find_match_words();
                            for (word_iter b = second_words.begin(); b != second_words.end(); ++b)
                            {
                                Word *second = *b;
                                current.words[1] = second;
                                m_pos += second->nbytes;

                                if (m_pos == m_text_length)
                                {
                                    // The second word reaches the end of the text.
                                    current.n = 2;
                                    result.push_back(current);
                                }
                                else if (m_pos < m_text_length)
                                {
                                    word_list third_words = find_match_words();
                                    for (word_iter c = third_words.begin(); c != third_words.end(); ++c)
                                    {
                                        Word *third = *c;
                                        if (third->length == -1) // tmp word
                                        {
                                            current.n = 2;
                                        }
                                        else
                                        {
                                            current.n = 3;
                                            current.words[2] = third;
                                        }
                                        result.push_back(current);
                                    }
                                }

                                // Step back before trying the next second word.
                                m_pos -= second->nbytes;
                            }
                        }

                        // Step back before trying the next first word.
                        m_pos -= first->nbytes;
                    }

                    m_pos = saved_pos;
                    return result;
                }
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                int Algorithm::next_char()
         | 
| 162 | 
            +
                {
         | 
| 163 | 
            +
                    // ONLY for UTF-8
         | 
| 164 | 
            +
                    unsigned char ch = m_text[m_pos];
         | 
| 165 | 
            +
                    if (ch >= 0xC0 && ch <= 0xDF)
         | 
| 166 | 
            +
                    {
         | 
| 167 | 
            +
                        return min(2, m_text_length-m_pos);
         | 
| 168 | 
            +
                    }
         | 
| 169 | 
            +
                    if (ch >= 0xE0 && ch <= 0xEF)
         | 
| 170 | 
            +
                    {
         | 
| 171 | 
            +
                        return min(3, m_text_length-m_pos);
         | 
| 172 | 
            +
                    }
         | 
| 173 | 
            +
                    return 1;
         | 
| 174 | 
            +
                }
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                /**
                 * Collect the dictionary words that start at m_pos.
                 *
                 * A small cache keyed by position is consulted first. On a miss,
                 * up to max_word_length() multi-byte characters are appended one
                 * at a time and each prefix is looked up with dict::get(). The
                 * scan stops early at a character whose length is <= 1. If no
                 * prefix is found, a single tmp word (length == -1) holding the
                 * character at m_pos is returned instead. The result is stored
                 * in the next cache slot, which wraps around.
                 *
                 * m_pos is unchanged on return.
                 *
                 * @return the matching words, shortest prefix first; never empty
                 *         on a cache miss.
                 */
                vector<Word *> Algorithm::find_match_words()
                {
                    // Serve repeated queries for the same position from the cache.
                    for (int slot = 0; slot < match_cache_size; ++slot)
                    {
                        if (m_match_cache[slot].first == m_pos)
                            return m_match_cache[slot].second;
                    }

                    vector<Word *> matches;
                    const int start = m_pos;

                    for (int n_chars = 0;
                         m_pos < m_text_length && n_chars < max_word_length();
                         ++n_chars)
                    {
                        const int char_len = next_char();
                        if (char_len <= 1)
                            break;
                        m_pos += char_len;

                        Word *hit = dict::get(m_text+start, m_pos-start);
                        if (hit)
                            matches.push_back(hit);
                    }

                    m_pos = start;

                    if (matches.empty())
                    {
                        // Fall back to a tmp word holding just the next character.
                        Word *tmp = get_tmp_word();
                        tmp->nbytes = next_char();
                        tmp->length = -1;
                        strncpy(tmp->text, m_text+m_pos, tmp->nbytes);
                        tmp->text[tmp->nbytes] = '\0';
                        matches.push_back(tmp);
                    }

                    // Remember the result in the next slot of the ring.
                    m_match_cache[m_match_cache_i].first = m_pos;
                    m_match_cache[m_match_cache_i].second = matches;
                    m_match_cache_i =
                        (m_match_cache_i + 1 >= match_cache_size) ? 0 : m_match_cache_i + 1;

                    return matches;
                }
         | 
| 222 | 
            +
            }
         | 
    
        data/ext/rmmseg/algor.h
    ADDED
    
    | @@ -0,0 +1,80 @@ | |
| 1 | 
            +
            #ifndef _ALGORITHM_H_
         | 
| 2 | 
            +
            #define _ALGORITHM_H_
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            #include <vector>
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            #include "chunk.h"
         | 
| 7 | 
            +
            #include "token.h"
         | 
| 8 | 
            +
            #include "dict.h"
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            /**
         | 
| 11 | 
            +
             * The MMSeg algorithm uses four rules:
         | 
| 12 | 
            +
             *  - Maximum matching rule
         | 
| 13 | 
            +
             *  - Largest average word length rule
         | 
| 14 | 
            +
             *  - Smallest variance of word length rule
         | 
| 15 | 
            +
             *  - Largest sum of degree of morphemic freedom of one-character
         | 
| 16 | 
            +
             *    words rule
         | 
| 17 | 
            +
             */
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            namespace rmmseg
         | 
| 20 | 
            +
            {
         | 
| 21 | 
            +
                class Algorithm
         | 
| 22 | 
            +
                {
         | 
| 23 | 
            +
                public:
         | 
| 24 | 
            +
                    Algorithm(const char *text, int length)
         | 
| 25 | 
            +
                        :m_text(text), m_pos(0),
         | 
| 26 | 
            +
                        m_text_length(length),
         | 
| 27 | 
            +
                        m_tmp_words_i(0),
         | 
| 28 | 
            +
                        m_match_cache_i(0)
         | 
| 29 | 
            +
                    {
         | 
| 30 | 
            +
                        for (int i = 0; i < match_cache_size; ++i)
         | 
| 31 | 
            +
                            m_match_cache[i].first = -1;
         | 
| 32 | 
            +
                    }
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    Token next_token();
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                    const char *get_text() const
         | 
| 37 | 
            +
                    {
         | 
| 38 | 
            +
                        return m_text;
         | 
| 39 | 
            +
                    }
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                private:
         | 
| 42 | 
            +
                    Token get_basic_latin_word();
         | 
| 43 | 
            +
                    Token get_cjk_word(int);
         | 
| 44 | 
            +
                    
         | 
| 45 | 
            +
                    std::vector<Chunk> create_chunks();
         | 
| 46 | 
            +
                    int next_word();
         | 
| 47 | 
            +
                    int next_char();
         | 
| 48 | 
            +
                    std::vector<Word *> find_match_words();
         | 
| 49 | 
            +
                    int max_word_length() { return 4; }
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                    
         | 
| 52 | 
            +
                    const char *m_text;
         | 
| 53 | 
            +
                    int m_pos;
         | 
| 54 | 
            +
                    int m_text_length;
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    /* tmp words are only for 1-char words which
         | 
| 57 | 
            +
                     * are not exist in the dictionary. It's length
         | 
| 58 | 
            +
                     * value will be set to -1 to indicate it is
         | 
| 59 | 
            +
                     * a tmp word. */
         | 
| 60 | 
            +
                    Word *get_tmp_word()
         | 
| 61 | 
            +
                    {
         | 
| 62 | 
            +
                        if (m_tmp_words_i >= max_tmp_words)
         | 
| 63 | 
            +
                            m_tmp_words_i = 0;  // round wrap
         | 
| 64 | 
            +
                        return &m_tmp_words[m_tmp_words_i++];
         | 
| 65 | 
            +
                    }
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    /* related to max_word_length and match_cache_size */
         | 
| 68 | 
            +
                    static const int max_tmp_words = 64;
         | 
| 69 | 
            +
                    Word m_tmp_words[max_tmp_words];
         | 
| 70 | 
            +
                    int m_tmp_words_i;
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                    /* match word caches */
         | 
| 73 | 
            +
                    static const int match_cache_size = 3;
         | 
| 74 | 
            +
                    typedef std::pair<int, std::vector<Word *> > match_cache_t;
         | 
| 75 | 
            +
                    match_cache_t m_match_cache[match_cache_size];
         | 
| 76 | 
            +
                    int m_match_cache_i;
         | 
| 77 | 
            +
                };
         | 
| 78 | 
            +
            }
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            #endif /* _ALGORITHM_H_ */
         | 
    
        data/ext/rmmseg/chunk.h
    ADDED
    
    | @@ -0,0 +1,59 @@ | |
| 1 | 
            +
            #ifndef _CHUNK_H_
         | 
| 2 | 
            +
            #define _CHUNK_H_
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            #include <cmath>
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            #include "word.h"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            namespace rmmseg
         | 
| 9 | 
            +
            {
         | 
| 10 | 
            +
                /**
         | 
| 11 | 
            +
                 * A chunk stores 3 (or less) successive words.
         | 
| 12 | 
            +
                 */
         | 
| 13 | 
            +
                struct Chunk
         | 
| 14 | 
            +
                {
         | 
| 15 | 
            +
                    int total_length() const
         | 
| 16 | 
            +
                    {
         | 
| 17 | 
            +
                        int len = 0;
         | 
| 18 | 
            +
                        for (int i = 0; i < n; ++i)
         | 
| 19 | 
            +
                            len += std::abs(words[i]->length);
         | 
| 20 | 
            +
                            //if (words[i]->length == -1) /* tmp word */
         | 
| 21 | 
            +
                            //    len += 1;
         | 
| 22 | 
            +
                            //else
         | 
| 23 | 
            +
                            //    len += words[i]->length;
         | 
| 24 | 
            +
                        return len;
         | 
| 25 | 
            +
                    }
         | 
| 26 | 
            +
                    double average_length() const
         | 
| 27 | 
            +
                    {
         | 
| 28 | 
            +
                        return ((double)total_length())/n;
         | 
| 29 | 
            +
                    }
         | 
| 30 | 
            +
                    double variance() const
         | 
| 31 | 
            +
                    {
         | 
| 32 | 
            +
                        double avg = average_length();
         | 
| 33 | 
            +
                        double sqr_sum = 0;
         | 
| 34 | 
            +
                        double tmp;
         | 
| 35 | 
            +
                        for (int i = 0; i < n; ++i)
         | 
| 36 | 
            +
                        {
         | 
| 37 | 
            +
                            tmp = std::abs(words[i]->length);
         | 
| 38 | 
            +
                            //if (tmp == -1)
         | 
| 39 | 
            +
                            //    tmp = 1;
         | 
| 40 | 
            +
                            tmp = tmp-avg;
         | 
| 41 | 
            +
                            sqr_sum += tmp*tmp;
         | 
| 42 | 
            +
                        }
         | 
| 43 | 
            +
                        return std::sqrt(sqr_sum);
         | 
| 44 | 
            +
                    }
         | 
| 45 | 
            +
                    int degree_of_morphemic_freedom() const
         | 
| 46 | 
            +
                    {
         | 
| 47 | 
            +
                        int sum = 0;
         | 
| 48 | 
            +
                        for (int i = 0; i < n; ++i)
         | 
| 49 | 
            +
                            sum += words[i]->freq;
         | 
| 50 | 
            +
                        return sum;
         | 
| 51 | 
            +
                    }
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    
         | 
| 54 | 
            +
                    int n;
         | 
| 55 | 
            +
                    Word *words[3];
         | 
| 56 | 
            +
                };
         | 
| 57 | 
            +
            }
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            #endif /* _CHUNK_H_ */
         | 
    
        data/ext/rmmseg/dict.cpp
    ADDED
    
    | @@ -0,0 +1,230 @@ | |
| 1 | 
            +
            #include <cstdio>
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            #include "dict.h"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            using namespace std;
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            namespace rmmseg
         | 
| 8 | 
            +
            {
         | 
| 9 | 
            +
                struct Entry
         | 
| 10 | 
            +
                {
         | 
| 11 | 
            +
                    Word *word;
         | 
| 12 | 
            +
                    Entry *next;
         | 
| 13 | 
            +
                };
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                const size_t init_size = 262147;
         | 
| 16 | 
            +
                const size_t max_density = 5;
         | 
| 17 | 
            +
                /*
         | 
| 18 | 
            +
                  Table of prime numbers 2^n+a, 2<=n<=30.
         | 
| 19 | 
            +
                */
         | 
| 20 | 
            +
                static size_t primes[] = {
         | 
| 21 | 
            +
                    524288 + 21,
         | 
| 22 | 
            +
                    1048576 + 7,
         | 
| 23 | 
            +
                    2097152 + 17,
         | 
| 24 | 
            +
                    4194304 + 15,
         | 
| 25 | 
            +
                    8388608 + 9,
         | 
| 26 | 
            +
                    16777216 + 43,
         | 
| 27 | 
            +
                    33554432 + 35,
         | 
| 28 | 
            +
                    67108864 + 15,
         | 
| 29 | 
            +
                    134217728 + 29,
         | 
| 30 | 
            +
                    268435456 + 3,
         | 
| 31 | 
            +
                    536870912 + 11,
         | 
| 32 | 
            +
                    1073741824 + 85,
         | 
| 33 | 
            +
                };
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
                static size_t n_bins = init_size;
         | 
| 37 | 
            +
                static size_t n_entries = 0;
         | 
| 38 | 
            +
                static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
         | 
| 39 | 
            +
                                                                        sizeof(Entry *)));
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                static size_t new_size()
         | 
| 42 | 
            +
                {
         | 
| 43 | 
            +
                    for (size_t i = 0;
         | 
| 44 | 
            +
                         i < sizeof(primes)/sizeof(primes[0]);
         | 
| 45 | 
            +
                         ++i)
         | 
| 46 | 
            +
                    {
         | 
| 47 | 
            +
                        if (primes[i] > n_bins)
         | 
| 48 | 
            +
                        {
         | 
| 49 | 
            +
                            return primes[i];
         | 
| 50 | 
            +
                        }
         | 
| 51 | 
            +
                    }
         | 
| 52 | 
            +
                    // TODO: raise exception here
         | 
| 53 | 
            +
                    return n_bins;
         | 
| 54 | 
            +
                }
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                /**
                 * Hash a byte sequence by mixing in one byte at a time and then
                 * applying a final avalanche step.
                 *
                 * @param str first byte of the key.
                 * @param len number of bytes to hash.
                 * @return the hash value; arithmetic wraps as unsigned int.
                 */
                static unsigned int hash(const char *str, int len)
                {
                    unsigned int h = 0;

                    // Mix every byte into the running value.
                    for (; len != 0; --len, ++str)
                    {
                        h += *str;
                        h += h << 10;
                        h ^= h >> 6;
                    }

                    // Final avalanche.
                    h += h << 3;
                    h ^= h >> 11;
                    h += h << 15;
                    return h;
                }
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                static void rehash()
         | 
| 72 | 
            +
                {
         | 
| 73 | 
            +
                    int new_n_bins = new_size();
         | 
| 74 | 
            +
                    Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
         | 
| 75 | 
            +
                                                                    sizeof(Entry *)));
         | 
| 76 | 
            +
                    Entry *entry, *next;
         | 
| 77 | 
            +
                    unsigned int hash_val;
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    for (size_t i = 0; i < n_bins; ++i)
         | 
| 80 | 
            +
                    {
         | 
| 81 | 
            +
                        entry = bins[i];
         | 
| 82 | 
            +
                        while (entry)
         | 
| 83 | 
            +
                        {
         | 
| 84 | 
            +
                            next = entry->next;
         | 
| 85 | 
            +
                            hash_val = hash(entry->word->text,
         | 
| 86 | 
            +
                                            entry->word->nbytes) % new_n_bins;
         | 
| 87 | 
            +
                            entry->next = new_bins[hash_val];
         | 
| 88 | 
            +
                            new_bins[hash_val] = entry;
         | 
| 89 | 
            +
                            entry = next;
         | 
| 90 | 
            +
                        }
         | 
| 91 | 
            +
                    }
         | 
| 92 | 
            +
                    free(bins);
         | 
| 93 | 
            +
                    n_bins = new_n_bins;
         | 
| 94 | 
            +
                    bins = new_bins;
         | 
| 95 | 
            +
                }
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                namespace dict
         | 
| 98 | 
            +
                {
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                    /**
         | 
| 101 | 
            +
                     * str: the base of the string
         | 
| 102 | 
            +
                     * len: length of the string (in bytes)
         | 
| 103 | 
            +
                     *
         | 
| 104 | 
            +
                     * str may be a substring of a big chunk of text thus not nul-terminated,
         | 
| 105 | 
            +
                     * so len is necessary here.
         | 
| 106 | 
            +
                     */
         | 
| 107 | 
            +
                    Word *get(const char *str, int len)
         | 
| 108 | 
            +
                    {
         | 
| 109 | 
            +
                        unsigned int h = hash(str, len) % n_bins;
         | 
| 110 | 
            +
                        Entry *entry = bins[h];
         | 
| 111 | 
            +
                        if (!entry)
         | 
| 112 | 
            +
                            return NULL;
         | 
| 113 | 
            +
                        do
         | 
| 114 | 
            +
                        {
         | 
| 115 | 
            +
                            if (len == entry->word->nbytes &&
         | 
| 116 | 
            +
                                strncmp(str, entry->word->text, len) == 0)
         | 
| 117 | 
            +
                                return entry->word;
         | 
| 118 | 
            +
                            entry = entry->next;
         | 
| 119 | 
            +
                        }
         | 
| 120 | 
            +
                        while (entry);
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                        return NULL;
         | 
| 123 | 
            +
                    }
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                    /**
                     * Insert `word` into the dictionary, keyed by its text bytes.
                     *
                     * If an entry with the same bytes already exists, that entry is
                     * pointed at `word`. WARNING: the previous Word object is then
                     * permanently lost. This IS a memory leak, because the memory is
                     * allocated by pool_alloc. Instead of fixing this, tuning the
                     * dictionary file (no duplicate lines) is a better idea.
                     *
                     * Otherwise a new entry is linked at the head of the bucket's chain;
                     * the table is grown first when n_entries/n_bins exceeds max_density.
                     */
                    void add(Word *word)
                    {
                        const unsigned int hash_val = hash(word->text, word->nbytes);
                        unsigned int h = hash_val % n_bins;

                        // Overwrite an existing entry with the same text, if any.
                        for (Entry *entry = bins[h]; entry; entry = entry->next)
                        {
                            if (word->nbytes == entry->word->nbytes &&
                                strncmp(word->text, entry->word->text, word->nbytes) == 0)
                            {
                                entry->word = word;
                                return;
                            }
                        }

                        // New key. Check the load factor on every insertion, not only
                        // when the target bucket happens to be empty.
                        if (n_entries/n_bins > max_density)
                        {
                            rehash();
                            h = hash_val % n_bins;
                        }

                        Entry *entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
                        if (!entry)
                        {
                            return;     // allocation failed: nothing can be inserted
                        }

                        entry->word = word;
                        // Always link in front of whatever the bucket holds right now.
                        // After rehash() the recomputed bucket may already contain
                        // relinked entries; setting next to NULL there would cut that
                        // whole chain out of the table.
                        entry->next = bins[h];
                        bins[h] = entry;
                        n_entries++;
                    }
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                    bool load_chars(const char *filename)
         | 
| 177 | 
            +
                    {
         | 
| 178 | 
            +
                        FILE *fp = fopen(filename, "r");
         | 
| 179 | 
            +
                        if (!fp)
         | 
| 180 | 
            +
                        {
         | 
| 181 | 
            +
                            return false;
         | 
| 182 | 
            +
                        }
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                        const size_t buf_len = 24;
         | 
| 185 | 
            +
                        char buf[buf_len];
         | 
| 186 | 
            +
                        char *ptr;
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                        while(fgets(buf, buf_len, fp))
         | 
| 189 | 
            +
                        {
         | 
| 190 | 
            +
                            // NOTE: there SHOULD be a newline at the end of the file
         | 
| 191 | 
            +
                            buf[strlen(buf)-1] = '\0';    // truncate the newline
         | 
| 192 | 
            +
                            ptr = strchr(buf, ' ');
         | 
| 193 | 
            +
                            if (!ptr)
         | 
| 194 | 
            +
                                continue;       // illegal input
         | 
| 195 | 
            +
                            *ptr = '\0';
         | 
| 196 | 
            +
                            add(make_word(ptr+1, 1, atoi(buf)));
         | 
| 197 | 
            +
                        }
         | 
| 198 | 
            +
                    
         | 
| 199 | 
            +
                        fclose(fp);
         | 
| 200 | 
            +
                        return true;
         | 
| 201 | 
            +
                    }
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                    bool load_words(const char *filename)
         | 
| 204 | 
            +
                    {
         | 
| 205 | 
            +
                        FILE *fp = fopen(filename, "r");
         | 
| 206 | 
            +
                        if (!fp)
         | 
| 207 | 
            +
                        {
         | 
| 208 | 
            +
                            return false;
         | 
| 209 | 
            +
                        }
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                        const int buf_len = 48;
         | 
| 212 | 
            +
                        char buf[buf_len];
         | 
| 213 | 
            +
                        char *ptr;
         | 
| 214 | 
            +
             | 
| 215 | 
            +
                        while(fgets(buf, buf_len, fp))
         | 
| 216 | 
            +
                        {
         | 
| 217 | 
            +
                            // NOTE: there SHOULD be a newline at the end of the file
         | 
| 218 | 
            +
                            buf[strlen(buf)-1] = '\0';    // truncate the newline
         | 
| 219 | 
            +
                            ptr = strchr(buf, ' ');
         | 
| 220 | 
            +
                            if (!ptr)
         | 
| 221 | 
            +
                                continue;       // illegal input
         | 
| 222 | 
            +
                            *ptr = '\0';
         | 
| 223 | 
            +
                            add(make_word(ptr+1, atoi(buf), 0));
         | 
| 224 | 
            +
                        }
         | 
| 225 | 
            +
                    
         | 
| 226 | 
            +
                        fclose(fp);
         | 
| 227 | 
            +
                        return true;
         | 
| 228 | 
            +
                    }
         | 
| 229 | 
            +
                }
         | 
| 230 | 
            +
            }
         | 
    
        data/ext/rmmseg/dict.h
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            /* Guard name avoids the reserved form "underscore + uppercase letter". */
            #ifndef RMMSEG_DICT_H_
            #define RMMSEG_DICT_H_

            #include "word.h"

            /**
             * A dictionary is a hash table of
             *  - key: string
             *  - value: word
             *
             * Dictionary data can be loaded from files. Two types of dictionary
             * files are supported:
             *  - character file: Each line contains a number and a character,
             *                    the number is the frequency of the character.
             *                    The frequency should NOT exceed 65535.
             *  - word file:      Each line contains a number and a word, the
             *                    number is the character count of the word.
             */

            namespace rmmseg
            {
                /* Instead of making a class with only one instance, I'll not
                 * bother to make it a class here. */

                namespace dict
                {
                    /* Insert a word, keyed by its text. A word with the same text
                     * that is already present is replaced. */
                    void  add(Word *word);

                    /* Load a character file (see above). Returns false if the file
                     * cannot be opened, true otherwise. */
                    bool  load_chars(const char *filename);

                    /* Load a word file (see above). Returns false if the file
                     * cannot be opened, true otherwise. */
                    bool  load_words(const char *filename);

                    /* Look up the word made of the len bytes at str; str need not be
                     * nul-terminated. Returns NULL when no such word is stored. */
                    Word *get(const char *str, int len);
                }
            }

            #endif /* RMMSEG_DICT_H_ */
         | 
| @@ -0,0 +1,17 @@ | |
| 1 | 
            +
            require 'mkmf'

            # Swap a leading $(CC) in the shared-library link command for g++.
            CONFIG['LDSHARED'] = CONFIG['LDSHARED'].sub(/^\$\(CC\)/, 'g++')

            # Earlier per-platform settings, kept for reference:
            # if RUBY_PLATFORM =~ /darwin/
            # #  CONFIG['LDSHARED'] = 'g++ --dynamiclib -flat_namespace -undefined suppress'
            #   CONFIG['LDSHARED'] = 'g++ --dynamiclib'
            # elsif RUBY_PLATFORM =~ /linux/
            #   CONFIG['LDSHARED'] = 'g++ -shared'
            # end

            # On darwin the link command is replaced outright.
            case RUBY_PLATFORM
            when /darwin/
              CONFIG['LDSHARED'] = 'g++ -dynamiclib -single_module -flat_namespace -undefined suppress'
            end

            # Object files that make up the extension.
            $objs = %w[algor.o dict.o memory.o rmmseg.o]
            create_makefile('rmmseg')
         | 
    
        data/ext/rmmseg/memory.h
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
| 1 | 
            +
            /* Guard name avoids the reserved form "underscore + uppercase letter". */
            #ifndef RMMSEG_MEMORY_H_
            #define RMMSEG_MEMORY_H_

            #include <cstdlib>

            /**
             * Pre-allocate a chunk of memory and hand it out in small pieces.
             * This memory is never freed after allocation. It is used for
             * persistent data such as dictionary contents, which is never
             * destroyed until the application exits.
             */

            namespace rmmseg
            {
                const size_t REALLOC_SIZE = 2048; /* 2KB */

                extern size_t  _pool_size;  /* bytes still free in the current chunk */
                extern char   *_pool_base;  /* next free byte of the current chunk */

                /**
                 * Allocate `len` bytes that are never freed.
                 *
                 * The request is rounded up to a multiple of sizeof(void *) so that
                 * each returned pointer stays suitably aligned for structs holding
                 * pointers, whatever sizes earlier callers asked for. Requests that
                 * fit are carved from the current chunk; otherwise a fresh chunk of
                 * REALLOC_SIZE bytes is started. A request larger than REALLOC_SIZE
                 * gets a block of its own and leaves the current chunk untouched.
                 *
                 * Returns NULL if the underlying malloc fails or the rounded size
                 * overflows.
                 */
                inline void *pool_alloc(size_t len)
                {
                    const size_t align = sizeof(void *);
                    const size_t rem = len % align;
                    if (rem != 0)
                    {
                        const size_t pad = align - rem;
                        if (len > static_cast<size_t>(-1) - pad)
                            return NULL;        /* rounding up would overflow */
                        len += pad;
                    }

                    if (len <= _pool_size)
                    {
                        void *mem = _pool_base;
                        _pool_size -= len;
                        _pool_base += len;
                        return mem;
                    }

                    /* Too big for any chunk: REALLOC_SIZE - len would wrap around
                     * and later allocations would run past the end of the chunk. */
                    if (len > REALLOC_SIZE)
                    {
                        return std::malloc(len);
                    }

                    /* NOTE: the remaining memory of the old chunk is simply
                     * discarded, which WILL cause a memory leak. However, this
                     * function is not for allocating large objects. A larger
                     * pre-alloc chunk size will also reduce the impact of this leak.
                     * So this is generally not a problem. */
                    char *chunk = static_cast<char *>(std::malloc(REALLOC_SIZE));
                    if (!chunk)
                    {
                        return NULL;            /* keep the pool state consistent */
                    }
                    _pool_base = chunk + len;
                    _pool_size = REALLOC_SIZE - len;
                    return chunk;
                }
            }

            #endif /* RMMSEG_MEMORY_H_ */
         |