rmmseg-cpp-new 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,253 @@
1
+ #include <cctype>
2
+ #include <cassert>
3
+
4
+ #include "rules.h"
5
+ #include "algor.h"
6
+
7
+ using namespace std;
8
+
9
+ namespace rmmseg
10
+ {
11
+ Token Algorithm::next_token()
12
+ {
13
+ do
14
+ {
15
+ if (m_pos >= m_text_length)
16
+ return Token(NULL, 0);
17
+
18
+ Token tk(NULL, 0);
19
+ int len = next_char();
20
+ if (len == 1)
21
+ tk = get_basic_latin_word();
22
+ else
23
+ tk = get_cjk_word(len);
24
+ if (tk.length > 0)
25
+ return tk;
26
+ }
27
+ while (true);
28
+ }
29
+
30
+ Token Algorithm::get_basic_latin_word()
31
+ {
32
+ int len = 1;
33
+ int start, end;
34
+
35
+ // Skip pre-word whitespaces and punctuations
36
+ while (m_pos < m_text_length)
37
+ {
38
+ if (len > 1)
39
+ break;
40
+ if (isalnum(m_text[m_pos]))
41
+ break;
42
+ m_pos++;
43
+ len = next_char();
44
+ }
45
+
46
+ start = m_pos;
47
+ while (m_pos < m_text_length)
48
+ {
49
+ if (len > 1)
50
+ break;
51
+ if (!isalnum(m_text[m_pos]))
52
+ break;
53
+ m_pos++;
54
+ len = next_char();
55
+ }
56
+ end = m_pos;
57
+
58
+ // Skip post-word whitespaces and punctuations
59
+ while (m_pos < m_text_length)
60
+ {
61
+ if (len > 1)
62
+ break;
63
+ if (isalnum(m_text[m_pos]))
64
+ break;
65
+ m_pos++;
66
+ len = next_char();
67
+ }
68
+
69
+ return Token(m_text+start, end-start);
70
+ }
71
+
72
+ Token Algorithm::get_cjk_word(int len)
73
+ {
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+ if (chunks.size() < 1)
86
+ return Token(NULL, 0);
87
+
88
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
89
+ m_pos += chunks[0].words[0]->nbytes;
90
+ return token;
91
+ }
92
+
93
+ vector<Chunk> Algorithm::create_chunks()
94
+ {
95
+ vector<Chunk> chunks;
96
+ Chunk chunk;
97
+ Word *w1, *w2, *w3;
98
+
99
+ int orig_pos = m_pos;
100
+ typedef vector<Word *> vec_t;
101
+ typedef vec_t::iterator it_t;
102
+
103
+ vec_t words1 = find_match_words();
104
+ for (it_t i1 = words1.begin();
105
+ i1 != words1.end();
106
+ ++i1)
107
+ {
108
+ w1 = *i1;
109
+ chunk.words[0] = w1;
110
+ m_pos += w1->nbytes;
111
+ if (m_pos < m_text_length)
112
+ {
113
+ vec_t words2 = find_match_words();
114
+ for (it_t i2 = words2.begin();
115
+ i2 != words2.end();
116
+ ++i2)
117
+ {
118
+ w2 = *i2;
119
+ chunk.words[1] = w2;
120
+ m_pos += w2->nbytes;
121
+ if (m_pos < m_text_length)
122
+ {
123
+ vec_t words3 = find_match_words();
124
+ for (it_t i3 = words3.begin();
125
+ i3 != words3.end();
126
+ ++i3)
127
+ {
128
+ w3 = *i3;
129
+ if (w3->length == -1) // tmp word
130
+ {
131
+ chunk.n = 2;
132
+ }
133
+ else
134
+ {
135
+ chunk.n = 3;
136
+ chunk.words[2] = w3;
137
+ }
138
+ chunks.push_back(chunk);
139
+ }
140
+ }
141
+ else if (m_pos == m_text_length)
142
+ {
143
+ chunk.n = 2;
144
+ chunks.push_back(chunk);
145
+ }
146
+ m_pos -= w2->nbytes;
147
+ }
148
+ }
149
+ else if (m_pos == m_text_length)
150
+ {
151
+ chunk.n = 1;
152
+ chunks.push_back(chunk);
153
+ }
154
+ m_pos -= w1->nbytes;
155
+ }
156
+
157
+ m_pos = orig_pos;
158
+ return chunks;
159
+ }
160
+
161
+ int Algorithm::next_char()
162
+ {
163
+ // ONLY for UTF-8
164
+ unsigned char ch = m_text[m_pos];
165
+
166
+ if ( ch>>7 == 0 )
167
+ return 1;
168
+ if ( ch>>5 == 0x6 )
169
+ return 2;
170
+ else if (ch>> 4 == 0xe)
171
+ return 3;
172
+ else if (ch>> 3 == 0x1e )
173
+ return 4;
174
+ else
175
+ return -1;
176
+ // if (ch >= 0xC0 && ch <= 0xDF)
177
+ // {
178
+ // return 1;
179
+ // }
180
+ // if (ch >= 0x60 && ch <= 0xEF)
181
+ // {
182
+ // return min(2, m_text_length-m_pos);
183
+ // }
184
+ // if (ch >= 0xE0 && ch <= 0xEF)
185
+ // {
186
+ // return min(3, m_text_length-m_pos);
187
+ // }
188
+ // return 1;
189
+ }
190
+
191
+ // int Algorithm::next_char(const char * p)
192
+ // {
193
+ // uint8 one = (unsigned char)(*p);
194
+
195
+ // if ( one>>7 == 0 )
196
+ // return 1;
197
+ // if ( one>>5 == 0x6 )
198
+ // return 2;
199
+ // else if (one >> 4 == 0xe)
200
+ // return 3;
201
+ // else if (one >> 3 == 0x1e )
202
+ // return 4;
203
+ // else
204
+ // return -1;
205
+ // }
206
+
207
+ vector<Word *> Algorithm::find_match_words()
208
+ {
209
+ for (int i = 0; i < match_cache_size; ++i)
210
+ if (m_match_cache[i].first == m_pos)
211
+ return m_match_cache[i].second;
212
+
213
+ vector<Word *> words;
214
+ Word *word;
215
+ int orig_pos = m_pos;
216
+ int n = 0, len;
217
+
218
+ while (m_pos < m_text_length)
219
+ {
220
+ if (n >= max_word_length())
221
+ break;
222
+ len = next_char();
223
+ if (len <= 1)
224
+ break;
225
+
226
+ m_pos += len;
227
+ n++;
228
+
229
+ word = dict::get(m_text+orig_pos, m_pos-orig_pos);
230
+ if (word)
231
+ words.push_back(word);
232
+ }
233
+
234
+ m_pos = orig_pos;
235
+
236
+ if (words.empty())
237
+ {
238
+ word = get_tmp_word();
239
+ word->nbytes = next_char();
240
+ word->length = -1;
241
+ strncpy(word->text, m_text+m_pos, word->nbytes);
242
+ word->text[word->nbytes] = '\0';
243
+ words.push_back(word);
244
+ }
245
+
246
+ m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
247
+ m_match_cache_i++;
248
+ if (m_match_cache_i >= match_cache_size)
249
+ m_match_cache_i = 0;
250
+
251
+ return words;
252
+ }
253
+ }
@@ -0,0 +1,79 @@
1
+ #ifndef _ALGORITHM_H_
2
+ #define _ALGORITHM_H_
3
+
4
+ #include <vector>
5
+
6
+ #include "chunk.h"
7
+ #include "token.h"
8
+ #include "dict.h"
9
+
10
+ /**
11
+ * The MMSeg algorithm uses four rules:
12
+ * - Maximum matching rule
13
+ * - Largest average word length rule
14
+ * - Smallest variance of word length rule
15
+ * - Largest sum of degree of morphemic freedom of one-character
16
+ * words rule
17
+ */
18
+
19
+ namespace rmmseg
20
+ {
21
+ class Algorithm
22
+ {
23
+ public:
24
+ Algorithm(const char *text, int length)
25
+ :m_text(text), m_pos(0),
26
+ m_text_length(length),
27
+ m_tmp_words_i(0),
28
+ m_match_cache_i(0)
29
+ {
30
+ for (int i = 0; i < match_cache_size; ++i)
31
+ m_match_cache[i].first = -1;
32
+ }
33
+
34
+ Token next_token();
35
+
36
+ const char *get_text() const
37
+ {
38
+ return m_text;
39
+ }
40
+
41
+ private:
42
+ Token get_basic_latin_word();
43
+ Token get_cjk_word(int);
44
+
45
+ std::vector<Chunk> create_chunks();
46
+ int next_word();
47
+ int next_char();
48
+ std::vector<Word *> find_match_words();
49
+ int max_word_length() { return 20; }
50
+
51
+ const char *m_text;
52
+ int m_pos;
53
+ int m_text_length;
54
+
55
+ /* tmp words are only for 1-char words which
56
+ * are not exist in the dictionary. It's length
57
+ * value will be set to -1 to indicate it is
58
+ * a tmp word. */
59
+ Word *get_tmp_word()
60
+ {
61
+ if (m_tmp_words_i >= max_tmp_words)
62
+ m_tmp_words_i = 0; // round wrap
63
+ return &m_tmp_words[m_tmp_words_i++];
64
+ }
65
+
66
+ /* related to max_word_length and match_cache_size */
67
+ static const int max_tmp_words = 512;
68
+ Word m_tmp_words[max_tmp_words];
69
+ int m_tmp_words_i;
70
+
71
+ /* match word caches */
72
+ static const int match_cache_size = 3;
73
+ typedef std::pair<int, std::vector<Word *> > match_cache_t;
74
+ match_cache_t m_match_cache[match_cache_size];
75
+ int m_match_cache_i;
76
+ };
77
+ }
78
+
79
+ #endif /* _ALGORITHM_H_ */
@@ -0,0 +1,59 @@
1
+ #ifndef _CHUNK_H_
2
+ #define _CHUNK_H_
3
+
4
+ #include <cmath>
5
+
6
+ #include "word.h"
7
+
8
+ namespace rmmseg
9
+ {
10
+ /**
11
+ * A chunk stores 3 (or less) successive words.
12
+ */
13
+ struct Chunk
14
+ {
15
+ int total_length() const
16
+ {
17
+ int len = 0;
18
+ for (int i = 0; i < n; ++i)
19
+ len += std::abs(words[i]->length);
20
+ //if (words[i]->length == -1) /* tmp word */
21
+ // len += 1;
22
+ //else
23
+ // len += words[i]->length;
24
+ return len;
25
+ }
26
+ double average_length() const
27
+ {
28
+ return ((double)total_length())/n;
29
+ }
30
+ double variance() const
31
+ {
32
+ double avg = average_length();
33
+ double sqr_sum = 0;
34
+ double tmp;
35
+ for (int i = 0; i < n; ++i)
36
+ {
37
+ tmp = std::abs(words[i]->length);
38
+ //if (tmp == -1)
39
+ // tmp = 1;
40
+ tmp = tmp-avg;
41
+ sqr_sum += tmp*tmp;
42
+ }
43
+ return std::sqrt(sqr_sum);
44
+ }
45
+ int degree_of_morphemic_freedom() const
46
+ {
47
+ int sum = 0;
48
+ for (int i = 0; i < n; ++i)
49
+ sum += words[i]->freq;
50
+ return sum;
51
+ }
52
+
53
+
54
+ int n;
55
+ Word *words[3];
56
+ };
57
+ }
58
+
59
+ #endif /* _CHUNK_H_ */
@@ -0,0 +1,230 @@
1
+ #include <cstdio>
2
+
3
+ #include "dict.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace rmmseg
8
+ {
9
+ struct Entry
10
+ {
11
+ Word *word;
12
+ Entry *next;
13
+ };
14
+
15
+ const size_t init_size = 262147;
16
+ const size_t max_density = 5;
17
+ /*
18
+ Table of prime numbers 2^n+a, 2<=n<=30.
19
+ */
20
+ static size_t primes[] = {
21
+ 524288 + 21,
22
+ 1048576 + 7,
23
+ 2097152 + 17,
24
+ 4194304 + 15,
25
+ 8388608 + 9,
26
+ 16777216 + 43,
27
+ 33554432 + 35,
28
+ 67108864 + 15,
29
+ 134217728 + 29,
30
+ 268435456 + 3,
31
+ 536870912 + 11,
32
+ 1073741824 + 85,
33
+ };
34
+
35
+
36
+ static size_t n_bins = init_size;
37
+ static size_t n_entries = 0;
38
+ static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
+ sizeof(Entry *)));
40
+
41
+ static size_t new_size()
42
+ {
43
+ for (size_t i = 0;
44
+ i < sizeof(primes)/sizeof(primes[0]);
45
+ ++i)
46
+ {
47
+ if (primes[i] > n_bins)
48
+ {
49
+ return primes[i];
50
+ }
51
+ }
52
+ // TODO: raise exception here
53
+ return n_bins;
54
+ }
55
+
56
/* Hash the first len bytes of str. Each byte is added to the running
 * key and mixed with a shift-add and a shift-xor step; three more
 * mixing steps finish the key after the last byte. */
static unsigned int hash(const char *str, int len)
{
    unsigned int h = 0;

    for (; len != 0; --len, ++str)
    {
        h += *str;
        h += h << 10;
        h ^= h >> 6;
    }

    h += h << 3;
    h ^= h >> 11;
    h += h << 15;

    return h;
}
70
+
71
+ static void rehash()
72
+ {
73
+ int new_n_bins = new_size();
74
+ Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
75
+ sizeof(Entry *)));
76
+ Entry *entry, *next;
77
+ unsigned int hash_val;
78
+
79
+ for (size_t i = 0; i < n_bins; ++i)
80
+ {
81
+ entry = bins[i];
82
+ while (entry)
83
+ {
84
+ next = entry->next;
85
+ hash_val = hash(entry->word->text,
86
+ entry->word->nbytes) % new_n_bins;
87
+ entry->next = new_bins[hash_val];
88
+ new_bins[hash_val] = entry;
89
+ entry = next;
90
+ }
91
+ }
92
+ free(bins);
93
+ n_bins = new_n_bins;
94
+ bins = new_bins;
95
+ }
96
+
97
+ namespace dict
98
+ {
99
+
100
+ /**
101
+ * str: the base of the string
102
+ * len: length of the string (in bytes)
103
+ *
104
+ * str may be a substring of a big chunk of text thus not nul-terminated,
105
+ * so len is necessary here.
106
+ */
107
+ Word *get(const char *str, int len)
108
+ {
109
+ unsigned int h = hash(str, len) % n_bins;
110
+ Entry *entry = bins[h];
111
+ if (!entry)
112
+ return NULL;
113
+ do
114
+ {
115
+ if (len == entry->word->nbytes &&
116
+ strncmp(str, entry->word->text, len) == 0)
117
+ return entry->word;
118
+ entry = entry->next;
119
+ }
120
+ while (entry);
121
+
122
+ return NULL;
123
+ }
124
+
125
+ void add(Word *word)
126
+ {
127
+ unsigned int hash_val = hash(word->text, word->nbytes);
128
+ unsigned int h = hash_val % n_bins;
129
+ Entry *entry = bins[h];
130
+ if (!entry)
131
+ {
132
+ if (n_entries/n_bins > max_density)
133
+ {
134
+ rehash();
135
+ h = hash_val % n_bins;
136
+ }
137
+
138
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
139
+ entry->word = word;
140
+ entry->next = NULL;
141
+ bins[h] = entry;
142
+ n_entries++;
143
+ return;
144
+ }
145
+
146
+ bool done = false;
147
+ do
148
+ {
149
+ if (word->nbytes == entry->word->nbytes &&
150
+ strncmp(word->text, entry->word->text, word->nbytes) == 0)
151
+ {
152
+ /* Overwriting. WARNING: the original Word object is
153
+ * permanently lost. This IS a memory leak, because
154
+ * the memory is allocated by pool_alloc. Instead of
155
+ * fixing this, tuning the dictionary file is a better
156
+ * idea
157
+ */
158
+ entry->word = word;
159
+ done = true;
160
+ break;
161
+ }
162
+ entry = entry->next;
163
+ }
164
+ while (entry);
165
+
166
+ if (!done)
167
+ {
168
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
169
+ entry->word = word;
170
+ entry->next = bins[h];
171
+ bins[h] = entry;
172
+ n_entries++;
173
+ }
174
+ }
175
+
176
+ bool load_chars(const char *filename)
177
+ {
178
+ FILE *fp = fopen(filename, "r");
179
+ if (!fp)
180
+ {
181
+ return false;
182
+ }
183
+
184
+ const size_t buf_len = 24;
185
+ char buf[buf_len];
186
+ char *ptr;
187
+
188
+ while(fgets(buf, buf_len, fp))
189
+ {
190
+ // NOTE: there SHOULD be a newline at the end of the file
191
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
192
+ ptr = strchr(buf, ' ');
193
+ if (!ptr)
194
+ continue; // illegal input
195
+ *ptr = '\0';
196
+ add(make_word(ptr+1, 1, atoi(buf)));
197
+ }
198
+
199
+ fclose(fp);
200
+ return true;
201
+ }
202
+
203
+ bool load_words(const char *filename)
204
+ {
205
+ FILE *fp = fopen(filename, "r");
206
+ if (!fp)
207
+ {
208
+ return false;
209
+ }
210
+
211
+ const int buf_len = 48;
212
+ char buf[buf_len];
213
+ char *ptr;
214
+
215
+ while(fgets(buf, buf_len, fp))
216
+ {
217
+ // NOTE: there SHOULD be a newline at the end of the file
218
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
219
+ ptr = strchr(buf, ' ');
220
+ if (!ptr)
221
+ continue; // illegal input
222
+ *ptr = '\0';
223
+ add(make_word(ptr+1, atoi(buf), 0));
224
+ }
225
+
226
+ fclose(fp);
227
+ return true;
228
+ }
229
+ }
230
+ }