rmmseg-cpp-traditional 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/History.txt +21 -0
  4. data/LICENSE.txt +22 -0
  5. data/Manifest.txt +43 -0
  6. data/README +111 -0
  7. data/README.md +29 -0
  8. data/Rakefile +19 -0
  9. data/bin/rmmseg +63 -0
  10. data/data/chars.dic +12638 -0
  11. data/data/words.dic +120308 -0
  12. data/ext/rmmseg/algor.cpp +222 -0
  13. data/ext/rmmseg/algor.h +80 -0
  14. data/ext/rmmseg/chunk.h +59 -0
  15. data/ext/rmmseg/dict.cpp +230 -0
  16. data/ext/rmmseg/dict.h +34 -0
  17. data/ext/rmmseg/extconf.rb +17 -0
  18. data/ext/rmmseg/memory.cpp +9 -0
  19. data/ext/rmmseg/memory.h +43 -0
  20. data/ext/rmmseg/rmmseg.cpp +263 -0
  21. data/ext/rmmseg/rules.h +86 -0
  22. data/ext/rmmseg/token.h +19 -0
  23. data/ext/rmmseg/word.h +44 -0
  24. data/lib/rmmseg/dictionary.rb +59 -0
  25. data/lib/rmmseg/ferret.rb +64 -0
  26. data/lib/rmmseg-cpp-traditional/version.rb +7 -0
  27. data/lib/rmmseg-cpp-traditional.rb +9 -0
  28. data/lib/rmmseg.rb +3 -0
  29. data/misc/convert.rb +114 -0
  30. data/misc/ferret_example.rb +59 -0
  31. data/misc/homepage.erb +196 -0
  32. data/misc/homepage.html +1212 -0
  33. data/rmmseg-cpp-traditional.gemspec +19 -0
  34. data/spec/rmmseg_spec.rb +8 -0
  35. data/spec/spec_helper.rb +17 -0
  36. data/tasks/ann.rake +81 -0
  37. data/tasks/bones.rake +21 -0
  38. data/tasks/gem.rake +126 -0
  39. data/tasks/git.rake +41 -0
  40. data/tasks/homepage.rake +15 -0
  41. data/tasks/manifest.rake +49 -0
  42. data/tasks/notes.rake +28 -0
  43. data/tasks/post_load.rake +39 -0
  44. data/tasks/rdoc.rake +51 -0
  45. data/tasks/rubyforge.rake +58 -0
  46. data/tasks/setup.rb +268 -0
  47. data/tasks/spec.rake +55 -0
  48. data/tasks/svn.rake +48 -0
  49. data/tasks/test.rake +38 -0
  50. data/test/test_rmmseg.rb +0 -0
  51. metadata +116 -0
@@ -0,0 +1,222 @@
1
+ #include <cctype>
2
+ #include <cassert>
3
+
4
+ #include "rules.h"
5
+ #include "algor.h"
6
+
7
+ using namespace std;
8
+
9
+ namespace rmmseg
10
+ {
11
+ Token Algorithm::next_token()
12
+ {
13
+ do
14
+ {
15
+ if (m_pos >= m_text_length)
16
+ return Token(NULL, 0);
17
+
18
+ Token tk(NULL, 0);
19
+ int len = next_char();
20
+ if (len == 1)
21
+ tk = get_basic_latin_word();
22
+ else
23
+ tk = get_cjk_word(len);
24
+ if (tk.length > 0)
25
+ return tk;
26
+ }
27
+ while (true);
28
+ }
29
+
30
+ Token Algorithm::get_basic_latin_word()
31
+ {
32
+ int len = 1;
33
+ int start, end;
34
+
35
+ // Skip pre-word whitespaces and punctuations
36
+ while (m_pos < m_text_length)
37
+ {
38
+ if (len > 1)
39
+ break;
40
+ if (isalnum(m_text[m_pos]))
41
+ break;
42
+ m_pos++;
43
+ len = next_char();
44
+ }
45
+
46
+ start = m_pos;
47
+ while (m_pos < m_text_length)
48
+ {
49
+ if (len > 1)
50
+ break;
51
+ if (!isalnum(m_text[m_pos]))
52
+ break;
53
+ m_pos++;
54
+ len = next_char();
55
+ }
56
+ end = m_pos;
57
+
58
+ // Skip post-word whitespaces and punctuations
59
+ while (m_pos < m_text_length)
60
+ {
61
+ if (len > 1)
62
+ break;
63
+ if (isalnum(m_text[m_pos]))
64
+ break;
65
+ m_pos++;
66
+ len = next_char();
67
+ }
68
+
69
+ return Token(m_text+start, end-start);
70
+ }
71
+
72
+ Token Algorithm::get_cjk_word(int len)
73
+ {
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+ if (chunks.size() < 1)
86
+ return Token(NULL, 0);
87
+
88
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
89
+ m_pos += chunks[0].words[0]->nbytes;
90
+ return token;
91
+ }
92
+
93
+ vector<Chunk> Algorithm::create_chunks()
94
+ {
95
+ vector<Chunk> chunks;
96
+ Chunk chunk;
97
+ Word *w1, *w2, *w3;
98
+
99
+ int orig_pos = m_pos;
100
+ typedef vector<Word *> vec_t;
101
+ typedef vec_t::iterator it_t;
102
+
103
+ vec_t words1 = find_match_words();
104
+ for (it_t i1 = words1.begin();
105
+ i1 != words1.end();
106
+ ++i1)
107
+ {
108
+ w1 = *i1;
109
+ chunk.words[0] = w1;
110
+ m_pos += w1->nbytes;
111
+ if (m_pos < m_text_length)
112
+ {
113
+ vec_t words2 = find_match_words();
114
+ for (it_t i2 = words2.begin();
115
+ i2 != words2.end();
116
+ ++i2)
117
+ {
118
+ w2 = *i2;
119
+ chunk.words[1] = w2;
120
+ m_pos += w2->nbytes;
121
+ if (m_pos < m_text_length)
122
+ {
123
+ vec_t words3 = find_match_words();
124
+ for (it_t i3 = words3.begin();
125
+ i3 != words3.end();
126
+ ++i3)
127
+ {
128
+ w3 = *i3;
129
+ if (w3->length == -1) // tmp word
130
+ {
131
+ chunk.n = 2;
132
+ }
133
+ else
134
+ {
135
+ chunk.n = 3;
136
+ chunk.words[2] = w3;
137
+ }
138
+ chunks.push_back(chunk);
139
+ }
140
+ }
141
+ else if (m_pos == m_text_length)
142
+ {
143
+ chunk.n = 2;
144
+ chunks.push_back(chunk);
145
+ }
146
+ m_pos -= w2->nbytes;
147
+ }
148
+ }
149
+ else if (m_pos == m_text_length)
150
+ {
151
+ chunk.n = 1;
152
+ chunks.push_back(chunk);
153
+ }
154
+ m_pos -= w1->nbytes;
155
+ }
156
+
157
+ m_pos = orig_pos;
158
+ return chunks;
159
+ }
160
+
161
+ int Algorithm::next_char()
162
+ {
163
+ // ONLY for UTF-8
164
+ unsigned char ch = m_text[m_pos];
165
+ if (ch >= 0xC0 && ch <= 0xDF)
166
+ {
167
+ return min(2, m_text_length-m_pos);
168
+ }
169
+ if (ch >= 0xE0 && ch <= 0xEF)
170
+ {
171
+ return min(3, m_text_length-m_pos);
172
+ }
173
+ return 1;
174
+ }
175
+
176
+ vector<Word *> Algorithm::find_match_words()
177
+ {
178
+ for (int i = 0; i < match_cache_size; ++i)
179
+ if (m_match_cache[i].first == m_pos)
180
+ return m_match_cache[i].second;
181
+
182
+ vector<Word *> words;
183
+ Word *word;
184
+ int orig_pos = m_pos;
185
+ int n = 0, len;
186
+
187
+ while (m_pos < m_text_length)
188
+ {
189
+ if (n >= max_word_length())
190
+ break;
191
+ len = next_char();
192
+ if (len <= 1)
193
+ break;
194
+
195
+ m_pos += len;
196
+ n++;
197
+
198
+ word = dict::get(m_text+orig_pos, m_pos-orig_pos);
199
+ if (word)
200
+ words.push_back(word);
201
+ }
202
+
203
+ m_pos = orig_pos;
204
+
205
+ if (words.empty())
206
+ {
207
+ word = get_tmp_word();
208
+ word->nbytes = next_char();
209
+ word->length = -1;
210
+ strncpy(word->text, m_text+m_pos, word->nbytes);
211
+ word->text[word->nbytes] = '\0';
212
+ words.push_back(word);
213
+ }
214
+
215
+ m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
216
+ m_match_cache_i++;
217
+ if (m_match_cache_i >= match_cache_size)
218
+ m_match_cache_i = 0;
219
+
220
+ return words;
221
+ }
222
+ }
@@ -0,0 +1,80 @@
1
+ #ifndef _ALGORITHM_H_
2
+ #define _ALGORITHM_H_
3
+
4
+ #include <vector>
5
+
6
+ #include "chunk.h"
7
+ #include "token.h"
8
+ #include "dict.h"
9
+
10
+ /**
11
+ * The MMSeg algorithm uses four rules:
12
+ * - Maximum matching rule
13
+ * - Largest average word length rule
14
+ * - Smallest variance of word length rule
15
+ * - Largest sum of degree of morphemic freedom of one-character
16
+ * words rule
17
+ */
18
+
19
+ namespace rmmseg
20
+ {
21
+ class Algorithm
22
+ {
23
+ public:
24
+ Algorithm(const char *text, int length)
25
+ :m_text(text), m_pos(0),
26
+ m_text_length(length),
27
+ m_tmp_words_i(0),
28
+ m_match_cache_i(0)
29
+ {
30
+ for (int i = 0; i < match_cache_size; ++i)
31
+ m_match_cache[i].first = -1;
32
+ }
33
+
34
+ Token next_token();
35
+
36
+ const char *get_text() const
37
+ {
38
+ return m_text;
39
+ }
40
+
41
+ private:
42
+ Token get_basic_latin_word();
43
+ Token get_cjk_word(int);
44
+
45
+ std::vector<Chunk> create_chunks();
46
+ int next_word();
47
+ int next_char();
48
+ std::vector<Word *> find_match_words();
49
+ int max_word_length() { return 4; }
50
+
51
+
52
+ const char *m_text;
53
+ int m_pos;
54
+ int m_text_length;
55
+
56
+ /* tmp words are only for 1-char words which
57
+ * are not exist in the dictionary. It's length
58
+ * value will be set to -1 to indicate it is
59
+ * a tmp word. */
60
+ Word *get_tmp_word()
61
+ {
62
+ if (m_tmp_words_i >= max_tmp_words)
63
+ m_tmp_words_i = 0; // round wrap
64
+ return &m_tmp_words[m_tmp_words_i++];
65
+ }
66
+
67
+ /* related to max_word_length and match_cache_size */
68
+ static const int max_tmp_words = 64;
69
+ Word m_tmp_words[max_tmp_words];
70
+ int m_tmp_words_i;
71
+
72
+ /* match word caches */
73
+ static const int match_cache_size = 3;
74
+ typedef std::pair<int, std::vector<Word *> > match_cache_t;
75
+ match_cache_t m_match_cache[match_cache_size];
76
+ int m_match_cache_i;
77
+ };
78
+ }
79
+
80
+ #endif /* _ALGORITHM_H_ */
@@ -0,0 +1,59 @@
1
+ #ifndef _CHUNK_H_
2
+ #define _CHUNK_H_
3
+
4
+ #include <cmath>
5
+
6
+ #include "word.h"
7
+
8
+ namespace rmmseg
9
+ {
10
+ /**
11
+ * A chunk stores 3 (or fewer) successive words.
12
+ */
13
+ struct Chunk
14
+ {
15
+ int total_length() const
16
+ {
17
+ int len = 0;
18
+ for (int i = 0; i < n; ++i)
19
+ len += std::abs(words[i]->length);
20
+ //if (words[i]->length == -1) /* tmp word */
21
+ // len += 1;
22
+ //else
23
+ // len += words[i]->length;
24
+ return len;
25
+ }
26
+ double average_length() const
27
+ {
28
+ return ((double)total_length())/n;
29
+ }
30
+ double variance() const
31
+ {
32
+ double avg = average_length();
33
+ double sqr_sum = 0;
34
+ double tmp;
35
+ for (int i = 0; i < n; ++i)
36
+ {
37
+ tmp = std::abs(words[i]->length);
38
+ //if (tmp == -1)
39
+ // tmp = 1;
40
+ tmp = tmp-avg;
41
+ sqr_sum += tmp*tmp;
42
+ }
43
+ return std::sqrt(sqr_sum);
44
+ }
45
+ int degree_of_morphemic_freedom() const
46
+ {
47
+ int sum = 0;
48
+ for (int i = 0; i < n; ++i)
49
+ sum += words[i]->freq;
50
+ return sum;
51
+ }
52
+
53
+
54
+ int n;
55
+ Word *words[3];
56
+ };
57
+ }
58
+
59
+ #endif /* _CHUNK_H_ */
@@ -0,0 +1,230 @@
1
+ #include <cstdio>
2
+
3
+ #include "dict.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace rmmseg
8
+ {
9
+ struct Entry
10
+ {
11
+ Word *word;
12
+ Entry *next;
13
+ };
14
+
15
+ const size_t init_size = 262147;
16
+ const size_t max_density = 5;
17
+ /*
18
+ Table of prime numbers 2^n+a, 2<=n<=30.
19
+ */
20
+ static size_t primes[] = {
21
+ 524288 + 21,
22
+ 1048576 + 7,
23
+ 2097152 + 17,
24
+ 4194304 + 15,
25
+ 8388608 + 9,
26
+ 16777216 + 43,
27
+ 33554432 + 35,
28
+ 67108864 + 15,
29
+ 134217728 + 29,
30
+ 268435456 + 3,
31
+ 536870912 + 11,
32
+ 1073741824 + 85,
33
+ };
34
+
35
+
36
+ static size_t n_bins = init_size;
37
+ static size_t n_entries = 0;
38
+ static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
+ sizeof(Entry *)));
40
+
41
+ static size_t new_size()
42
+ {
43
+ for (size_t i = 0;
44
+ i < sizeof(primes)/sizeof(primes[0]);
45
+ ++i)
46
+ {
47
+ if (primes[i] > n_bins)
48
+ {
49
+ return primes[i];
50
+ }
51
+ }
52
+ // TODO: raise exception here
53
+ return n_bins;
54
+ }
55
+
56
/* Hash len bytes starting at str: each byte is added to the running value
 * and mixed in with shifts, then three final mixing steps are applied.
 * An empty input hashes to 0. */
static unsigned int hash(const char *str, int len)
{
    unsigned int h = 0;

    for (; len != 0; --len, ++str)
    {
        h += *str;
        h += h << 10;
        h ^= h >> 6;
    }

    // final mixing
    h += h << 3;
    h ^= h >> 11;
    h += h << 15;
    return h;
}
70
+
71
+ static void rehash()
72
+ {
73
+ int new_n_bins = new_size();
74
+ Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
75
+ sizeof(Entry *)));
76
+ Entry *entry, *next;
77
+ unsigned int hash_val;
78
+
79
+ for (size_t i = 0; i < n_bins; ++i)
80
+ {
81
+ entry = bins[i];
82
+ while (entry)
83
+ {
84
+ next = entry->next;
85
+ hash_val = hash(entry->word->text,
86
+ entry->word->nbytes) % new_n_bins;
87
+ entry->next = new_bins[hash_val];
88
+ new_bins[hash_val] = entry;
89
+ entry = next;
90
+ }
91
+ }
92
+ free(bins);
93
+ n_bins = new_n_bins;
94
+ bins = new_bins;
95
+ }
96
+
97
+ namespace dict
98
+ {
99
+
100
+ /**
101
+ * str: the base of the string
102
+ * len: length of the string (in bytes)
103
+ *
104
+ * str may be a substring of a big chunk of text thus not nul-terminated,
105
+ * so len is necessary here.
106
+ */
107
+ Word *get(const char *str, int len)
108
+ {
109
+ unsigned int h = hash(str, len) % n_bins;
110
+ Entry *entry = bins[h];
111
+ if (!entry)
112
+ return NULL;
113
+ do
114
+ {
115
+ if (len == entry->word->nbytes &&
116
+ strncmp(str, entry->word->text, len) == 0)
117
+ return entry->word;
118
+ entry = entry->next;
119
+ }
120
+ while (entry);
121
+
122
+ return NULL;
123
+ }
124
+
125
+ void add(Word *word)
126
+ {
127
+ unsigned int hash_val = hash(word->text, word->nbytes);
128
+ unsigned int h = hash_val % n_bins;
129
+ Entry *entry = bins[h];
130
+ if (!entry)
131
+ {
132
+ if (n_entries/n_bins > max_density)
133
+ {
134
+ rehash();
135
+ h = hash_val % n_bins;
136
+ }
137
+
138
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
139
+ entry->word = word;
140
+ entry->next = NULL;
141
+ bins[h] = entry;
142
+ n_entries++;
143
+ return;
144
+ }
145
+
146
+ bool done = false;
147
+ do
148
+ {
149
+ if (word->nbytes == entry->word->nbytes &&
150
+ strncmp(word->text, entry->word->text, word->nbytes) == 0)
151
+ {
152
+ /* Overwriting. WARNING: the original Word object is
153
+ * permanently lost. This IS a memory leak, because
154
+ * the memory is allocated by pool_alloc. Instead of
155
+ * fixing this, tuning the dictionary file is a better
156
+ * idea
157
+ */
158
+ entry->word = word;
159
+ done = true;
160
+ break;
161
+ }
162
+ entry = entry->next;
163
+ }
164
+ while (entry);
165
+
166
+ if (!done)
167
+ {
168
+ entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
169
+ entry->word = word;
170
+ entry->next = bins[h];
171
+ bins[h] = entry;
172
+ n_entries++;
173
+ }
174
+ }
175
+
176
+ bool load_chars(const char *filename)
177
+ {
178
+ FILE *fp = fopen(filename, "r");
179
+ if (!fp)
180
+ {
181
+ return false;
182
+ }
183
+
184
+ const size_t buf_len = 24;
185
+ char buf[buf_len];
186
+ char *ptr;
187
+
188
+ while(fgets(buf, buf_len, fp))
189
+ {
190
+ // NOTE: there SHOULD be a newline at the end of the file
191
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
192
+ ptr = strchr(buf, ' ');
193
+ if (!ptr)
194
+ continue; // illegal input
195
+ *ptr = '\0';
196
+ add(make_word(ptr+1, 1, atoi(buf)));
197
+ }
198
+
199
+ fclose(fp);
200
+ return true;
201
+ }
202
+
203
+ bool load_words(const char *filename)
204
+ {
205
+ FILE *fp = fopen(filename, "r");
206
+ if (!fp)
207
+ {
208
+ return false;
209
+ }
210
+
211
+ const int buf_len = 48;
212
+ char buf[buf_len];
213
+ char *ptr;
214
+
215
+ while(fgets(buf, buf_len, fp))
216
+ {
217
+ // NOTE: there SHOULD be a newline at the end of the file
218
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
219
+ ptr = strchr(buf, ' ');
220
+ if (!ptr)
221
+ continue; // illegal input
222
+ *ptr = '\0';
223
+ add(make_word(ptr+1, atoi(buf), 0));
224
+ }
225
+
226
+ fclose(fp);
227
+ return true;
228
+ }
229
+ }
230
+ }
data/ext/rmmseg/dict.h ADDED
@@ -0,0 +1,34 @@
1
+ #ifndef _DICT_H_
2
+ #define _DICT_H_
3
+
4
+ #include "word.h"
5
+
6
+ /**
7
+ * A dictionary is a hash table of
8
+ * - key: string
9
+ * - value: word
10
+ *
11
+ * Dictionary data can be loaded from files. Two types of dictionary
12
+ * files are supported:
13
+ * - character file: Each line contains a number and a character,
14
+ * the number is the frequency of the character.
15
+ * The frequency should NOT exceed 65535.
16
+ * - word file: Each line contains a number and a word, the
17
+ * number is the character count of the word.
18
+ */
19
+
20
+ namespace rmmseg
21
+ {
22
+ /* Instead of making a class with only one instance, i'll not
23
+ * bother to make it a class here. */
24
+
25
+ namespace dict
26
+ {
27
+ void add(Word *word);
28
+ bool load_chars(const char *filename);
29
+ bool load_words(const char *filename);
30
+ Word *get(const char *str, int len);
31
+ }
32
+ }
33
+
34
+ #endif /* _DICT_H_ */
@@ -0,0 +1,17 @@
1
+ require 'mkmf'
2
+
3
# The extension is C++: swap a leading $(CC) in the shared-library link
# command for g++.
CONFIG['LDSHARED'] = CONFIG['LDSHARED'].sub(/^\$\(CC\)/, 'g++')

# On darwin the link command is replaced outright. Earlier variants that
# were tried and abandoned: 'g++ --dynamiclib' (optionally with
# '-flat_namespace -undefined suppress') on darwin, 'g++ -shared' on linux.
CONFIG['LDSHARED'] = 'g++ -dynamiclib -single_module -flat_namespace -undefined suppress' if RUBY_PLATFORM =~ /darwin/

# Object files that make up the extension.
$objs = %w[algor.o dict.o memory.o rmmseg.o]
create_makefile('rmmseg')
@@ -0,0 +1,9 @@
1
+ #include "memory.h"
2
+
3
namespace rmmseg
{
    /* Size of the block allocated up front for the pool: 2MB. */
    static const size_t PRE_ALLOC_SIZE = 2097152;

    /* Next free byte of the pool and the number of bytes still available. */
    char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
    size_t _pool_size = PRE_ALLOC_SIZE;
}
@@ -0,0 +1,43 @@
1
+ #ifndef _MEMORY_H_
2
+ #define _MEMORY_H_
3
+
4
+ #include <cstdlib>
5
+
6
+ /**
7
+ * Pre-allocate a chunk of memory and allocate them in small pieces.
8
+ * That memory is never freed after allocation. Used for persistent
9
+ * data like dictionary contents that will never be destroyed unless
10
+ * the application exited.
11
+ */
12
+
13
+ namespace rmmseg
14
+ {
15
+ const size_t REALLOC_SIZE = 2048; /* 2KB */
16
+
17
+ extern size_t _pool_size;
18
+ extern char *_pool_base;
19
+
20
+ inline void *pool_alloc(size_t len)
21
+ {
22
+ void *mem = _pool_base;
23
+
24
+ if (len <= _pool_size)
25
+ {
26
+ _pool_size -= len;
27
+ _pool_base += len;
28
+ return mem;
29
+ }
30
+
31
+ /* NOTE: the remaining memory is simply discard, which WILL
32
+ * cause memory leak. However, this function is not for allocating
33
+ * large object. Larger pre-alloc chunk size will also reduce the
34
+ * impact of this leak. So this is generally not a problem. */
35
+ _pool_base = static_cast<char *>(std::malloc(REALLOC_SIZE));
36
+ mem = _pool_base;
37
+ _pool_base += len;
38
+ _pool_size = REALLOC_SIZE - len;
39
+ return mem;
40
+ }
41
+ }
42
+
43
+ #endif /* _MEMORY_H_ */