rmmseg-cpp-new 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,253 @@
1
+ #include <cctype>
2
+ #include <cassert>
3
+
4
+ #include "rules.h"
5
+ #include "algor.h"
6
+
7
+ using namespace std;
8
+
9
+ namespace rmmseg
10
+ {
11
+ Token Algorithm::next_token()
12
+ {
13
+ do
14
+ {
15
+ if (m_pos >= m_text_length)
16
+ return Token(NULL, 0);
17
+
18
+ Token tk(NULL, 0);
19
+ int len = next_char();
20
+ if (len == 1)
21
+ tk = get_basic_latin_word();
22
+ else
23
+ tk = get_cjk_word(len);
24
+ if (tk.length > 0)
25
+ return tk;
26
+ }
27
+ while (true);
28
+ }
29
+
30
+ Token Algorithm::get_basic_latin_word()
31
+ {
32
+ int len = 1;
33
+ int start, end;
34
+
35
+ // Skip pre-word whitespaces and punctuations
36
+ while (m_pos < m_text_length)
37
+ {
38
+ if (len > 1)
39
+ break;
40
+ if (isalnum(m_text[m_pos]))
41
+ break;
42
+ m_pos++;
43
+ len = next_char();
44
+ }
45
+
46
+ start = m_pos;
47
+ while (m_pos < m_text_length)
48
+ {
49
+ if (len > 1)
50
+ break;
51
+ if (!isalnum(m_text[m_pos]))
52
+ break;
53
+ m_pos++;
54
+ len = next_char();
55
+ }
56
+ end = m_pos;
57
+
58
+ // Skip post-word whitespaces and punctuations
59
+ while (m_pos < m_text_length)
60
+ {
61
+ if (len > 1)
62
+ break;
63
+ if (isalnum(m_text[m_pos]))
64
+ break;
65
+ m_pos++;
66
+ len = next_char();
67
+ }
68
+
69
+ return Token(m_text+start, end-start);
70
+ }
71
+
72
+ Token Algorithm::get_cjk_word(int len)
73
+ {
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+ if (chunks.size() < 1)
86
+ return Token(NULL, 0);
87
+
88
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
89
+ m_pos += chunks[0].words[0]->nbytes;
90
+ return token;
91
+ }
92
+
93
+ vector<Chunk> Algorithm::create_chunks()
94
+ {
95
+ vector<Chunk> chunks;
96
+ Chunk chunk;
97
+ Word *w1, *w2, *w3;
98
+
99
+ int orig_pos = m_pos;
100
+ typedef vector<Word *> vec_t;
101
+ typedef vec_t::iterator it_t;
102
+
103
+ vec_t words1 = find_match_words();
104
+ for (it_t i1 = words1.begin();
105
+ i1 != words1.end();
106
+ ++i1)
107
+ {
108
+ w1 = *i1;
109
+ chunk.words[0] = w1;
110
+ m_pos += w1->nbytes;
111
+ if (m_pos < m_text_length)
112
+ {
113
+ vec_t words2 = find_match_words();
114
+ for (it_t i2 = words2.begin();
115
+ i2 != words2.end();
116
+ ++i2)
117
+ {
118
+ w2 = *i2;
119
+ chunk.words[1] = w2;
120
+ m_pos += w2->nbytes;
121
+ if (m_pos < m_text_length)
122
+ {
123
+ vec_t words3 = find_match_words();
124
+ for (it_t i3 = words3.begin();
125
+ i3 != words3.end();
126
+ ++i3)
127
+ {
128
+ w3 = *i3;
129
+ if (w3->length == -1) // tmp word
130
+ {
131
+ chunk.n = 2;
132
+ }
133
+ else
134
+ {
135
+ chunk.n = 3;
136
+ chunk.words[2] = w3;
137
+ }
138
+ chunks.push_back(chunk);
139
+ }
140
+ }
141
+ else if (m_pos == m_text_length)
142
+ {
143
+ chunk.n = 2;
144
+ chunks.push_back(chunk);
145
+ }
146
+ m_pos -= w2->nbytes;
147
+ }
148
+ }
149
+ else if (m_pos == m_text_length)
150
+ {
151
+ chunk.n = 1;
152
+ chunks.push_back(chunk);
153
+ }
154
+ m_pos -= w1->nbytes;
155
+ }
156
+
157
+ m_pos = orig_pos;
158
+ return chunks;
159
+ }
160
+
161
+ int Algorithm::next_char()
162
+ {
163
+ // ONLY for UTF-8
164
+ unsigned char ch = m_text[m_pos];
165
+
166
+ if ( ch>>7 == 0 )
167
+ return 1;
168
+ if ( ch>>5 == 0x6 )
169
+ return 2;
170
+ else if (ch>> 4 == 0xe)
171
+ return 3;
172
+ else if (ch>> 3 == 0x1e )
173
+ return 4;
174
+ else
175
+ return -1;
176
+ // if (ch >= 0xC0 && ch <= 0xDF)
177
+ // {
178
+ // return 1;
179
+ // }
180
+ // if (ch >= 0x60 && ch <= 0xEF)
181
+ // {
182
+ // return min(2, m_text_length-m_pos);
183
+ // }
184
+ // if (ch >= 0xE0 && ch <= 0xEF)
185
+ // {
186
+ // return min(3, m_text_length-m_pos);
187
+ // }
188
+ // return 1;
189
+ }
190
+
191
+ // int Algorithm::next_char(const char * p)
192
+ // {
193
+ // uint8 one = (unsigned char)(*p);
194
+
195
+ // if ( one>>7 == 0 )
196
+ // return 1;
197
+ // if ( one>>5 == 0x6 )
198
+ // return 2;
199
+ // else if (one >> 4 == 0xe)
200
+ // return 3;
201
+ // else if (one >> 3 == 0x1e )
202
+ // return 4;
203
+ // else
204
+ // return -1;
205
+ // }
206
+
207
+ vector<Word *> Algorithm::find_match_words()
208
+ {
209
+ for (int i = 0; i < match_cache_size; ++i)
210
+ if (m_match_cache[i].first == m_pos)
211
+ return m_match_cache[i].second;
212
+
213
+ vector<Word *> words;
214
+ Word *word;
215
+ int orig_pos = m_pos;
216
+ int n = 0, len;
217
+
218
+ while (m_pos < m_text_length)
219
+ {
220
+ if (n >= max_word_length())
221
+ break;
222
+ len = next_char();
223
+ if (len <= 1)
224
+ break;
225
+
226
+ m_pos += len;
227
+ n++;
228
+
229
+ word = dict::get(m_text+orig_pos, m_pos-orig_pos);
230
+ if (word)
231
+ words.push_back(word);
232
+ }
233
+
234
+ m_pos = orig_pos;
235
+
236
+ if (words.empty())
237
+ {
238
+ word = get_tmp_word();
239
+ word->nbytes = next_char();
240
+ word->length = -1;
241
+ strncpy(word->text, m_text+m_pos, word->nbytes);
242
+ word->text[word->nbytes] = '\0';
243
+ words.push_back(word);
244
+ }
245
+
246
+ m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
247
+ m_match_cache_i++;
248
+ if (m_match_cache_i >= match_cache_size)
249
+ m_match_cache_i = 0;
250
+
251
+ return words;
252
+ }
253
+ }
@@ -0,0 +1,79 @@
1
+ #ifndef _ALGORITHM_H_
2
+ #define _ALGORITHM_H_
3
+
4
+ #include <vector>
5
+
6
+ #include "chunk.h"
7
+ #include "token.h"
8
+ #include "dict.h"
9
+
10
+ /**
11
+ * The MMSeg algorithm uses four rules:
12
+ * - Maximum matching rule
13
+ * - Largest average word length rule
14
+ * - Smallest variance of word length rule
15
+ * - Largest sum of degree of morphemic freedom of one-character
16
+ * words rule
17
+ */
18
+
19
+ namespace rmmseg
20
+ {
21
+ class Algorithm
22
+ {
23
+ public:
24
+ Algorithm(const char *text, int length)
25
+ :m_text(text), m_pos(0),
26
+ m_text_length(length),
27
+ m_tmp_words_i(0),
28
+ m_match_cache_i(0)
29
+ {
30
+ for (int i = 0; i < match_cache_size; ++i)
31
+ m_match_cache[i].first = -1;
32
+ }
33
+
34
+ Token next_token();
35
+
36
+ const char *get_text() const
37
+ {
38
+ return m_text;
39
+ }
40
+
41
+ private:
42
+ Token get_basic_latin_word();
43
+ Token get_cjk_word(int);
44
+
45
+ std::vector<Chunk> create_chunks();
46
+ int next_word();
47
+ int next_char();
48
+ std::vector<Word *> find_match_words();
49
+ int max_word_length() { return 20; }
50
+
51
+ const char *m_text;
52
+ int m_pos;
53
+ int m_text_length;
54
+
55
+ /* tmp words are only for 1-char words which
56
+ * are not exist in the dictionary. It's length
57
+ * value will be set to -1 to indicate it is
58
+ * a tmp word. */
59
+ Word *get_tmp_word()
60
+ {
61
+ if (m_tmp_words_i >= max_tmp_words)
62
+ m_tmp_words_i = 0; // round wrap
63
+ return &m_tmp_words[m_tmp_words_i++];
64
+ }
65
+
66
+ /* related to max_word_length and match_cache_size */
67
+ static const int max_tmp_words = 512;
68
+ Word m_tmp_words[max_tmp_words];
69
+ int m_tmp_words_i;
70
+
71
+ /* match word caches */
72
+ static const int match_cache_size = 3;
73
+ typedef std::pair<int, std::vector<Word *> > match_cache_t;
74
+ match_cache_t m_match_cache[match_cache_size];
75
+ int m_match_cache_i;
76
+ };
77
+ }
78
+
79
+ #endif /* _ALGORITHM_H_ */
@@ -0,0 +1,59 @@
1
+ #ifndef _CHUNK_H_
2
+ #define _CHUNK_H_
3
+
4
+ #include <cmath>
5
+
6
+ #include "word.h"
7
+
8
+ namespace rmmseg
9
+ {
10
+ /**
11
+ * A chunk stores 3 (or less) successive words.
12
+ */
13
+ struct Chunk
14
+ {
15
+ int total_length() const
16
+ {
17
+ int len = 0;
18
+ for (int i = 0; i < n; ++i)
19
+ len += std::abs(words[i]->length);
20
+ //if (words[i]->length == -1) /* tmp word */
21
+ // len += 1;
22
+ //else
23
+ // len += words[i]->length;
24
+ return len;
25
+ }
26
+ double average_length() const
27
+ {
28
+ return ((double)total_length())/n;
29
+ }
30
+ double variance() const
31
+ {
32
+ double avg = average_length();
33
+ double sqr_sum = 0;
34
+ double tmp;
35
+ for (int i = 0; i < n; ++i)
36
+ {
37
+ tmp = std::abs(words[i]->length);
38
+ //if (tmp == -1)
39
+ // tmp = 1;
40
+ tmp = tmp-avg;
41
+ sqr_sum += tmp*tmp;
42
+ }
43
+ return std::sqrt(sqr_sum);
44
+ }
45
+ int degree_of_morphemic_freedom() const
46
+ {
47
+ int sum = 0;
48
+ for (int i = 0; i < n; ++i)
49
+ sum += words[i]->freq;
50
+ return sum;
51
+ }
52
+
53
+
54
+ int n;
55
+ Word *words[3];
56
+ };
57
+ }
58
+
59
+ #endif /* _CHUNK_H_ */
@@ -0,0 +1,230 @@
1
+ #include <cstdio>
2
+
3
+ #include "dict.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace rmmseg
8
+ {
9
+ struct Entry
10
+ {
11
+ Word *word;
12
+ Entry *next;
13
+ };
14
+
15
+ const size_t init_size = 262147;
16
+ const size_t max_density = 5;
17
+ /*
18
+ Table of prime numbers 2^n+a, 2<=n<=30.
19
+ */
20
+ static size_t primes[] = {
21
+ 524288 + 21,
22
+ 1048576 + 7,
23
+ 2097152 + 17,
24
+ 4194304 + 15,
25
+ 8388608 + 9,
26
+ 16777216 + 43,
27
+ 33554432 + 35,
28
+ 67108864 + 15,
29
+ 134217728 + 29,
30
+ 268435456 + 3,
31
+ 536870912 + 11,
32
+ 1073741824 + 85,
33
+ };
34
+
35
+
36
+ static size_t n_bins = init_size;
37
+ static size_t n_entries = 0;
38
+ static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
+ sizeof(Entry *)));
40
+
41
+ static size_t new_size()
42
+ {
43
+ for (size_t i = 0;
44
+ i < sizeof(primes)/sizeof(primes[0]);
45
+ ++i)
46
+ {
47
+ if (primes[i] > n_bins)
48
+ {
49
+ return primes[i];
50
+ }
51
+ }
52
+ // TODO: raise exception here
53
+ return n_bins;
54
+ }
55
+
56
/**
 * Hash the first len bytes of str.
 *
 * Each byte is added to the accumulator and mixed with a shift-add and a
 * shift-xor step; three final mixing steps are applied after the last
 * byte. str does not need to be NUL-terminated.
 *
 * @param str start of the bytes to hash
 * @param len number of bytes to hash
 * @return the hash value (0 for len == 0)
 */
static unsigned int hash(const char *str, int len)
{
    unsigned int acc = 0;

    for (const char *p = str; len != 0; --len, ++p)
    {
        acc += *p;
        acc += (acc << 10);
        acc ^= (acc >> 6);
    }

    // Final avalanche
    acc += (acc << 3);
    acc ^= (acc >> 11);
    acc += (acc << 15);
    return acc;
}
70
+
71
+ static void rehash()
72
+ {
73
+ int new_n_bins = new_size();
74
+ Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
75
+ sizeof(Entry *)));
76
+ Entry *entry, *next;
77
+ unsigned int hash_val;
78
+
79
+ for (size_t i = 0; i < n_bins; ++i)
80
+ {
81
+ entry = bins[i];
82
+ while (entry)
83
+ {
84
+ next = entry->next;
85
+ hash_val = hash(entry->word->text,
86
+ entry->word->nbytes) % new_n_bins;
87
+ entry->next = new_bins[hash_val];
88
+ new_bins[hash_val] = entry;
89
+ entry = next;
90
+ }
91
+ }
92
+ free(bins);
93
+ n_bins = new_n_bins;
94
+ bins = new_bins;
95
+ }
96
+
97
+ namespace dict
98
+ {
99
+
100
+ /**
101
+ * str: the base of the string
102
+ * len: length of the string (in bytes)
103
+ *
104
+ * str may be a substring of a big chunk of text thus not nul-terminated,
105
+ * so len is necessary here.
106
+ */
107
+ Word *get(const char *str, int len)
108
+ {
109
+ unsigned int h = hash(str, len) % n_bins;
110
+ Entry *entry = bins[h];
111
+ if (!entry)
112
+ return NULL;
113
+ do
114
+ {
115
+ if (len == entry->word->nbytes &&
116
+ strncmp(str, entry->word->text, len) == 0)
117
+ return entry->word;
118
+ entry = entry->next;
119
+ }
120
+ while (entry);
121
+
122
+ return NULL;
123
+ }
124
+
125
+ void add(Word *word)
126
+ {
127
+ unsigned int hash_val = hash(word->text, word->nbytes);
128
+ unsigned int h = hash_val % n_bins;
129
+ Entry *entry = bins[h];
130
+ if (!entry)
131
+ {
132
+ if (n_entries/n_bins > max_density)
133
+ {
134
+ rehash();
135
+ h = hash_val % n_bins;
136
+ }
137
+
138
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
139
+ entry->word = word;
140
+ entry->next = NULL;
141
+ bins[h] = entry;
142
+ n_entries++;
143
+ return;
144
+ }
145
+
146
+ bool done = false;
147
+ do
148
+ {
149
+ if (word->nbytes == entry->word->nbytes &&
150
+ strncmp(word->text, entry->word->text, word->nbytes) == 0)
151
+ {
152
+ /* Overwriting. WARNING: the original Word object is
153
+ * permanently lost. This IS a memory leak, because
154
+ * the memory is allocated by pool_alloc. Instead of
155
+ * fixing this, tuning the dictionary file is a better
156
+ * idea
157
+ */
158
+ entry->word = word;
159
+ done = true;
160
+ break;
161
+ }
162
+ entry = entry->next;
163
+ }
164
+ while (entry);
165
+
166
+ if (!done)
167
+ {
168
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
169
+ entry->word = word;
170
+ entry->next = bins[h];
171
+ bins[h] = entry;
172
+ n_entries++;
173
+ }
174
+ }
175
+
176
+ bool load_chars(const char *filename)
177
+ {
178
+ FILE *fp = fopen(filename, "r");
179
+ if (!fp)
180
+ {
181
+ return false;
182
+ }
183
+
184
+ const size_t buf_len = 24;
185
+ char buf[buf_len];
186
+ char *ptr;
187
+
188
+ while(fgets(buf, buf_len, fp))
189
+ {
190
+ // NOTE: there SHOULD be a newline at the end of the file
191
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
192
+ ptr = strchr(buf, ' ');
193
+ if (!ptr)
194
+ continue; // illegal input
195
+ *ptr = '\0';
196
+ add(make_word(ptr+1, 1, atoi(buf)));
197
+ }
198
+
199
+ fclose(fp);
200
+ return true;
201
+ }
202
+
203
+ bool load_words(const char *filename)
204
+ {
205
+ FILE *fp = fopen(filename, "r");
206
+ if (!fp)
207
+ {
208
+ return false;
209
+ }
210
+
211
+ const int buf_len = 48;
212
+ char buf[buf_len];
213
+ char *ptr;
214
+
215
+ while(fgets(buf, buf_len, fp))
216
+ {
217
+ // NOTE: there SHOULD be a newline at the end of the file
218
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
219
+ ptr = strchr(buf, ' ');
220
+ if (!ptr)
221
+ continue; // illegal input
222
+ *ptr = '\0';
223
+ add(make_word(ptr+1, atoi(buf), 0));
224
+ }
225
+
226
+ fclose(fp);
227
+ return true;
228
+ }
229
+ }
230
+ }