pluskid-rmmseg-cpp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,216 @@
1
+ #include <cctype>
2
+ #include <cassert>
3
+
4
+ #include "rules.h"
5
+ #include "algor.h"
6
+
7
+ using namespace std;
8
+
9
+ namespace rmmseg
10
+ {
11
+ Token Algorithm::next_token()
12
+ {
13
+ do
14
+ {
15
+ if (m_pos >= m_text_length)
16
+ return Token(NULL, 0);
17
+
18
+ Token tk(NULL, 0);
19
+ int len = next_char();
20
+ if (len == 1)
21
+ tk = get_basic_latin_word();
22
+ else
23
+ tk = get_cjk_word(len);
24
+ if (tk.length > 0)
25
+ return tk;
26
+ }
27
+ while (true);
28
+ }
29
+
30
+ Token Algorithm::get_basic_latin_word()
31
+ {
32
+ int len = 1;
33
+ int start, end;
34
+
35
+ // Skip pre-word whitespaces and punctuations
36
+ while (m_pos < m_text_length)
37
+ {
38
+ if (len > 1)
39
+ break;
40
+ if (isalnum(m_text[m_pos]))
41
+ break;
42
+ m_pos++;
43
+ len = next_char();
44
+ }
45
+
46
+ start = m_pos;
47
+ while (m_pos < m_text_length)
48
+ {
49
+ if (len > 1)
50
+ break;
51
+ if (!isalnum(m_text[m_pos]))
52
+ break;
53
+ m_pos++;
54
+ len = next_char();
55
+ }
56
+ end = m_pos;
57
+
58
+ // Skip post-word whitespaces and punctuations
59
+ while (m_pos < m_text_length)
60
+ {
61
+ if (len > 1)
62
+ break;
63
+ if (isalnum(m_text[m_pos]))
64
+ break;
65
+ m_pos++;
66
+ len = next_char();
67
+ }
68
+
69
+ return Token(m_text+start, end-start);
70
+ }
71
+
72
+ Token Algorithm::get_cjk_word(int len)
73
+ {
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+
86
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
87
+ m_pos += chunks[0].words[0]->nbytes;
88
+ return token;
89
+ }
90
+
91
+ vector<Chunk> Algorithm::create_chunks()
92
+ {
93
+ vector<Chunk> chunks;
94
+ Chunk chunk;
95
+ Word *w1, *w2, *w3;
96
+
97
+ int orig_pos = m_pos;
98
+ typedef vector<Word *> vec_t;
99
+ typedef vec_t::iterator it_t;
100
+
101
+ vec_t words1 = find_match_words();
102
+ for (it_t i1 = words1.begin();
103
+ i1 != words1.end();
104
+ ++i1)
105
+ {
106
+ w1 = *i1;
107
+ chunk.words[0] = w1;
108
+ m_pos += w1->nbytes;
109
+ if (m_pos < m_text_length)
110
+ {
111
+ vec_t words2 = find_match_words();
112
+ for (it_t i2 = words2.begin();
113
+ i2 != words2.end();
114
+ ++i2)
115
+ {
116
+ w2 = *i2;
117
+ chunk.words[1] = w2;
118
+ m_pos += w2->nbytes;
119
+ if (m_pos < m_text_length)
120
+ {
121
+ vec_t words3 = find_match_words();
122
+ for (it_t i3 = words3.begin();
123
+ i3 != words3.end();
124
+ ++i3)
125
+ {
126
+ w3 = *i3;
127
+ if (w3->length == -1) // tmp word
128
+ {
129
+ chunk.n = 2;
130
+ }
131
+ else
132
+ {
133
+ chunk.n = 3;
134
+ chunk.words[2] = w3;
135
+ }
136
+ chunks.push_back(chunk);
137
+ }
138
+ }
139
+ else if (m_pos == m_text_length)
140
+ {
141
+ chunk.n = 2;
142
+ chunks.push_back(chunk);
143
+ }
144
+ m_pos -= w2->nbytes;
145
+ }
146
+ }
147
+ else if (m_pos == m_text_length)
148
+ {
149
+ chunk.n = 1;
150
+ chunks.push_back(chunk);
151
+ }
152
+ m_pos -= w1->nbytes;
153
+ }
154
+
155
+ m_pos = orig_pos;
156
+ return chunks;
157
+ }
158
+
159
+ int Algorithm::next_char()
160
+ {
161
+ // ONLY for UTF-8
162
+ unsigned char ch = m_text[m_pos];
163
+ if (ch >= 0xC0 && ch <= 0xDF)
164
+ return 2;
165
+ if (ch >= 0xE0 && ch <= 0xEF)
166
+ return 3;
167
+ return 1;
168
+ }
169
+
170
+ vector<Word *> Algorithm::find_match_words()
171
+ {
172
+ for (int i = 0; i < match_cache_size; ++i)
173
+ if (m_match_cache[i].first == m_pos)
174
+ return m_match_cache[i].second;
175
+
176
+ vector<Word *> words;
177
+ Word *word;
178
+ int orig_pos = m_pos;
179
+ int n = 0, len;
180
+
181
+ while (m_pos < m_text_length)
182
+ {
183
+ len = next_char();
184
+ if (len <= 1)
185
+ break;
186
+ if (n >= max_word_length())
187
+ break;
188
+
189
+ m_pos += len;
190
+ n++;
191
+
192
+ word = dict::get(m_text+orig_pos, m_pos-orig_pos);
193
+ if (word)
194
+ words.push_back(word);
195
+ }
196
+
197
+ m_pos = orig_pos;
198
+
199
+ if (words.empty())
200
+ {
201
+ word = get_tmp_word();
202
+ word->nbytes = next_char();
203
+ word->length = -1;
204
+ strncpy(word->text, m_text+m_pos, word->nbytes);
205
+ word->text[word->nbytes] = '\0';
206
+ words.push_back(word);
207
+ }
208
+
209
+ m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
210
+ m_match_cache_i++;
211
+ if (m_match_cache_i >= match_cache_size)
212
+ m_match_cache_i = 0;
213
+
214
+ return words;
215
+ }
216
+ }
@@ -0,0 +1,75 @@
1
+ #ifndef _ALGORITHM_H_
2
+ #define _ALGORITHM_H_
3
+
4
+ #include <vector>
5
+
6
+ #include "chunk.h"
7
+ #include "token.h"
8
+ #include "dict.h"
9
+
10
+ /**
11
+ * The MMSeg algorithm uses four rules:
12
+ * - Maximum matching rule
13
+ * - Largest average word length rule
14
+ * - Smallest variance of word length rule
15
+ * - Largest sum of degree of morphemic freedom of one-character
16
+ * words rule
17
+ */
18
+
19
+ namespace rmmseg
20
+ {
21
+ class Algorithm
22
+ {
23
+ public:
24
+ Algorithm(const char *text, int length)
25
+ :m_text(text), m_pos(0),
26
+ m_text_length(length),
27
+ m_tmp_words_i(0),
28
+ m_match_cache_i(0)
29
+ {
30
+ for (int i = 0; i < match_cache_size; ++i)
31
+ m_match_cache[i].first = -1;
32
+ }
33
+
34
+ Token next_token();
35
+
36
+ private:
37
+ Token get_basic_latin_word();
38
+ Token get_cjk_word(int);
39
+
40
+ std::vector<Chunk> create_chunks();
41
+ int next_word();
42
+ int next_char();
43
+ std::vector<Word *> find_match_words();
44
+ int max_word_length() { return 4; }
45
+
46
+
47
+ const char *m_text;
48
+ int m_pos;
49
+ int m_text_length;
50
+
51
+ /* tmp words are only for 1-char words which
52
+ * are not exist in the dictionary. It's length
53
+ * value will be set to -1 to indicate it is
54
+ * a tmp word. */
55
+ Word *get_tmp_word()
56
+ {
57
+ if (m_tmp_words_i >= max_tmp_words)
58
+ m_tmp_words_i = 0; // round wrap
59
+ return &m_tmp_words[m_tmp_words_i++];
60
+ }
61
+
62
+ /* related to max_word_length and match_words_cache_size */
63
+ static const int max_tmp_words = 64;
64
+ Word m_tmp_words[max_tmp_words];
65
+ int m_tmp_words_i;
66
+
67
+ /* match word caches */
68
+ static const int match_cache_size = 3;
69
+ typedef std::pair<int, std::vector<Word *> > match_cache_t;
70
+ match_cache_t m_match_cache[match_cache_size];
71
+ int m_match_cache_i;
72
+ };
73
+ }
74
+
75
+ #endif /* _ALGORITHM_H_ */
@@ -0,0 +1,58 @@
1
+ #ifndef _CHUNK_H_
2
+ #define _CHUNK_H_
3
+
4
+ #include <cmath>
5
+
6
+ #include "word.h"
7
+
8
+ namespace rmmseg
9
+ {
10
+ /**
11
+ * A chunk stores 3 (or less) successive words.
12
+ */
13
+ struct Chunk
14
+ {
15
+ int total_length()
16
+ {
17
+ int len = 0;
18
+ for (int i = 0; i < n; ++i)
19
+ if (words[i]->length == -1) /* tmp word */
20
+ len += 1;
21
+ else
22
+ len += words[i]->length;
23
+ return len;
24
+ }
25
+ double average_length()
26
+ {
27
+ return ((double)total_length())/n;
28
+ }
29
+ double variance()
30
+ {
31
+ double avg = average_length();
32
+ double sqr_sum = 0;
33
+ double tmp;
34
+ for (int i = 0; i < n; ++i)
35
+ {
36
+ tmp = words[i]->length;
37
+ if (tmp == -1)
38
+ tmp = 1;
39
+ tmp = tmp-avg;
40
+ sqr_sum += tmp*tmp;
41
+ }
42
+ return std::sqrt(sqr_sum);
43
+ }
44
+ int degree_of_morphemic_freedom()
45
+ {
46
+ int sum = 0;
47
+ for (int i = 0; i < n; ++i)
48
+ sum += words[i]->freq;
49
+ return sum;
50
+ }
51
+
52
+
53
+ int n;
54
+ Word *words[3];
55
+ };
56
+ }
57
+
58
+ #endif /* _CHUNK_H_ */
@@ -0,0 +1,228 @@
1
+ #include <cstdio>
2
+
3
+ #include "dict.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace rmmseg
8
+ {
9
+ struct Entry
10
+ {
11
+ Word *word;
12
+ Entry *next;
13
+ };
14
+
15
+ const int init_size = 262147;
16
+ const int max_density = 5;
17
+ /*
18
+ Table of prime numbers 2^n+a, 2<=n<=30.
19
+ */
20
+ static int primes[] = {
21
+ 524288 + 21,
22
+ 1048576 + 7,
23
+ 2097152 + 17,
24
+ 4194304 + 15,
25
+ 8388608 + 9,
26
+ 16777216 + 43,
27
+ 33554432 + 35,
28
+ 67108864 + 15,
29
+ 134217728 + 29,
30
+ 268435456 + 3,
31
+ 536870912 + 11,
32
+ 1073741824 + 85,
33
+ };
34
+
35
+
36
+ static int n_bins = init_size;
37
+ static int n_entries = 0;
38
+ static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
+ sizeof(Entry *)));
40
+
41
+ static int new_size()
42
+ {
43
+ for (int i = 0;
44
+ i < sizeof(primes)/sizeof(primes[0]);
45
+ ++i)
46
+ {
47
+ if (primes[i] > n_bins)
48
+ {
49
+ return primes[i];
50
+ }
51
+ }
52
+ // TODO: raise exception here
53
+ return n_bins;
54
+ }
55
+
56
/**
 * Hash the first len bytes of str (one-at-a-time mixing).
 *
 * Bytes are read as unsigned values, so the result does not depend on
 * whether plain char is signed on the target platform. A len of zero
 * or less hashes no bytes and returns 0.
 */
static unsigned int hash(const char *str, int len)
{
    unsigned int key = 0;
    for (int i = 0; i < len; ++i)
    {
        key += static_cast<unsigned char>(str[i]);
        key += (key << 10);
        key ^= (key >> 6);
    }
    key += (key << 3);
    key ^= (key >> 11);
    key += (key << 15);
    return key;
}
70
+
71
+ static void rehash()
72
+ {
73
+ int new_n_bins = new_size();
74
+ Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
75
+ sizeof(Entry *)));
76
+ Entry *entry, *next;
77
+ unsigned int hash_val;
78
+
79
+ for (int i = 0; i < n_bins; ++i)
80
+ {
81
+ entry = bins[i];
82
+ while (entry)
83
+ {
84
+ next = entry->next;
85
+ hash_val = hash(entry->word->text,
86
+ entry->word->nbytes) % new_n_bins;
87
+ entry->next = new_bins[hash_val];
88
+ new_bins[hash_val] = entry;
89
+ entry = next;
90
+ }
91
+ }
92
+ free(bins);
93
+ n_bins = new_n_bins;
94
+ bins = new_bins;
95
+ }
96
+
97
+ namespace dict
98
+ {
99
+
100
+ /**
101
+ * str: the base of the string
102
+ * len: length of the string (in bytes)
103
+ *
104
+ * str may be a substring of a big chunk of text thus not nul-terminated,
105
+ * so len is necessary here.
106
+ */
107
+ Word *get(const char *str, int len)
108
+ {
109
+ unsigned int h = hash(str, len) % n_bins;
110
+ Entry *entry = bins[h];
111
+ if (!entry)
112
+ return NULL;
113
+ do
114
+ {
115
+ if (len == entry->word->nbytes &&
116
+ strncmp(str, entry->word->text, len) == 0)
117
+ return entry->word;
118
+ entry = entry->next;
119
+ }
120
+ while (entry);
121
+
122
+ return NULL;
123
+ }
124
+
125
+ void add(Word *word)
126
+ {
127
+ unsigned int hash_val = hash(word->text, word->nbytes);
128
+ unsigned int h = hash_val % n_bins;
129
+ Entry *entry = bins[h];
130
+ if (!entry)
131
+ {
132
+ if (n_entries/n_bins > max_density)
133
+ {
134
+ rehash();
135
+ h = hash_val % n_bins;
136
+ }
137
+
138
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
139
+ entry->word = word;
140
+ entry->next = NULL;
141
+ bins[h] = entry;
142
+ n_entries++;
143
+ }
144
+
145
+ bool done = false;
146
+ do
147
+ {
148
+ if (word->nbytes == entry->word->nbytes &&
149
+ strncmp(word->text, entry->word->text, word->nbytes) == 0)
150
+ {
151
+ /* Overwriting. WARNING: the original Word object is
152
+ * permanently lost. This IS a memory leak, because
153
+ * the memory is allocated by pool_alloc. Instead of
154
+ * fixing this, tuning the dictionary file is a better
155
+ * idea
156
+ */
157
+ entry->word = word;
158
+ done = true;
159
+ break;
160
+ }
161
+ entry = entry->next;
162
+ }
163
+ while (entry);
164
+
165
+ if (!done)
166
+ {
167
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
168
+ entry->word = word;
169
+ entry->next = bins[h];
170
+ bins[h] = entry;
171
+ }
172
+ }
173
+
174
+ bool load_chars(const char *filename)
175
+ {
176
+ FILE *fp = fopen(filename, "r");
177
+ if (!fp)
178
+ {
179
+ return false;
180
+ }
181
+
182
+ const int buf_len = 24;
183
+ char buf[buf_len];
184
+ char *ptr;
185
+
186
+ while(fgets(buf, buf_len, fp))
187
+ {
188
+ // NOTE: there SHOULD be a newline at the end of the file
189
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
190
+ ptr = strchr(buf, ' ');
191
+ if (!ptr)
192
+ continue; // illegal input
193
+ *ptr = '\0';
194
+ add(make_word(ptr+1, 1, atoi(buf)));
195
+ }
196
+
197
+ fclose(fp);
198
+ return true;
199
+ }
200
+
201
+ bool load_words(const char *filename)
202
+ {
203
+ FILE *fp = fopen(filename, "r");
204
+ if (!fp)
205
+ {
206
+ return false;
207
+ }
208
+
209
+ const int buf_len = 48;
210
+ char buf[buf_len];
211
+ char *ptr;
212
+
213
+ while(fgets(buf, buf_len, fp))
214
+ {
215
+ // NOTE: there SHOULD be a newline at the end of the file
216
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
217
+ ptr = strchr(buf, ' ');
218
+ if (!ptr)
219
+ continue; // illegal input
220
+ *ptr = '\0';
221
+ add(make_word(ptr+1, atoi(buf), 0));
222
+ }
223
+
224
+ fclose(fp);
225
+ return true;
226
+ }
227
+ }
228
+ }
data/ext/rmmseg/dict.h ADDED
@@ -0,0 +1,34 @@
1
+ #ifndef _DICT_H_
2
+ #define _DICT_H_
3
+
4
+ #include "word.h"
5
+
6
+ /**
7
+ * A dictionary is a hash table of
8
+ * - key: string
9
+ * - value: word
10
+ *
11
+ * Dictionary data can be loaded from files. Two types of dictionary
12
+ * files are supported:
13
+ * - character file: Each line contains a number and a character,
14
+ * the number is the frequency of the character.
15
+ * The frequency should NOT exceed 65535.
16
+ * - word file: Each line contains a number and a word, the
17
+ * number is the character count of the word.
18
+ */
19
+
20
+ namespace rmmseg
21
+ {
22
+ /* Instead of making a class with only one instance, i'll not
23
+ * bother to make it a class here. */
24
+
25
+ namespace dict
26
+ {
27
+ void add(Word *word);
28
+ bool load_chars(const char *filename);
29
+ bool load_words(const char *filename);
30
+ Word *get(const char *str, int len);
31
+ }
32
+ }
33
+
34
+ #endif /* _DICT_H_ */
@@ -0,0 +1,6 @@
1
require 'mkmf'

# Use g++ as the command that links the shared object
# (presumably because the extension is written in C++ -- confirm).
CONFIG['LDSHARED'] = 'g++ -shared'

# Object files that make up the extension.
$objs = %w[algor.o dict.o memory.o rmmseg.o]

create_makefile('rmmseg')
@@ -0,0 +1,9 @@
1
+ #include "memory.h"
2
+
3
namespace
{
    /* Number of bytes reserved for the pool at start-up (2MB). */
    const int pre_alloc_size = 2 * 1024 * 1024;
}

namespace rmmseg
{
    /* Bytes still available in the current pool block. */
    int _pool_size = pre_alloc_size;
    /* Next free byte of the current pool block. */
    char *_pool_base = static_cast<char *>(std::malloc(pre_alloc_size));
}
@@ -0,0 +1,43 @@
1
+ #ifndef _MEMORY_H_
2
+ #define _MEMORY_H_
3
+
4
+ #include <cstdlib>
5
+
6
+ /**
7
+ * Pre-allocate a chunk of memory and allocate them in small pieces.
8
+ * This memory is never freed after allocation. Used for persistent
9
+ * data like dictionary contents that will never be destroyed unless
10
+ * the application exits.
11
+ */
12
+
13
namespace rmmseg
{
    /* Size of each block obtained once the current one is used up. */
    const int REALLOC_SIZE = 2048; /* 2KB */

    extern int _pool_size;
    extern char *_pool_base;

    /**
     * Hand out len bytes from the pool. The memory is never freed.
     *
     * The request is rounded up to a multiple of sizeof(void *), so
     * that every returned pointer keeps that alignment as long as each
     * block starts suitably aligned (malloc'ed blocks do).
     *
     * Returns NULL when len is negative or when memory cannot be
     * obtained. A request larger than REALLOC_SIZE gets a dedicated
     * block of its own and leaves the current pool block untouched.
     */
    inline void *pool_alloc(int len)
    {
        if (len < 0)
            return NULL;

        // Unsigned arithmetic: rounding up cannot overflow for any
        // non-negative int.
        const unsigned int align = sizeof(void *);
        const unsigned int need =
            (static_cast<unsigned int>(len) + align - 1) / align * align;

        if (_pool_base != NULL && _pool_size >= 0 &&
            need <= static_cast<unsigned int>(_pool_size))
        {
            void *mem = _pool_base;
            _pool_base += need;
            _pool_size -= static_cast<int>(need);
            return mem;
        }

        // Too big for a regular block: a block of REALLOC_SIZE bytes
        // would be smaller than the request.
        if (need > static_cast<unsigned int>(REALLOC_SIZE))
            return std::malloc(need);

        /* NOTE: the remaining memory of the old block is simply
         * discarded, which WILL cause a memory leak. However, this
         * function is not meant for allocating large objects, and a
         * larger pre-allocated chunk also reduces the impact of this
         * leak. So this is generally not a problem. */
        char *fresh = static_cast<char *>(std::malloc(REALLOC_SIZE));
        if (fresh == NULL)
            return NULL; // pool state is left as it was

        _pool_base = fresh + need;
        _pool_size = REALLOC_SIZE - static_cast<int>(need);
        return fresh;
    }
}
42
+
43
+ #endif /* _MEMORY_H_ */