rmmseg-cpp-traditional 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/History.txt +21 -0
  4. data/LICENSE.txt +22 -0
  5. data/Manifest.txt +43 -0
  6. data/README +111 -0
  7. data/README.md +29 -0
  8. data/Rakefile +19 -0
  9. data/bin/rmmseg +63 -0
  10. data/data/chars.dic +12638 -0
  11. data/data/words.dic +120308 -0
  12. data/ext/rmmseg/algor.cpp +222 -0
  13. data/ext/rmmseg/algor.h +80 -0
  14. data/ext/rmmseg/chunk.h +59 -0
  15. data/ext/rmmseg/dict.cpp +230 -0
  16. data/ext/rmmseg/dict.h +34 -0
  17. data/ext/rmmseg/extconf.rb +17 -0
  18. data/ext/rmmseg/memory.cpp +9 -0
  19. data/ext/rmmseg/memory.h +43 -0
  20. data/ext/rmmseg/rmmseg.cpp +263 -0
  21. data/ext/rmmseg/rules.h +86 -0
  22. data/ext/rmmseg/token.h +19 -0
  23. data/ext/rmmseg/word.h +44 -0
  24. data/lib/rmmseg/dictionary.rb +59 -0
  25. data/lib/rmmseg/ferret.rb +64 -0
  26. data/lib/rmmseg-cpp-traditional/version.rb +7 -0
  27. data/lib/rmmseg-cpp-traditional.rb +9 -0
  28. data/lib/rmmseg.rb +3 -0
  29. data/misc/convert.rb +114 -0
  30. data/misc/ferret_example.rb +59 -0
  31. data/misc/homepage.erb +196 -0
  32. data/misc/homepage.html +1212 -0
  33. data/rmmseg-cpp-traditional.gemspec +19 -0
  34. data/spec/rmmseg_spec.rb +8 -0
  35. data/spec/spec_helper.rb +17 -0
  36. data/tasks/ann.rake +81 -0
  37. data/tasks/bones.rake +21 -0
  38. data/tasks/gem.rake +126 -0
  39. data/tasks/git.rake +41 -0
  40. data/tasks/homepage.rake +15 -0
  41. data/tasks/manifest.rake +49 -0
  42. data/tasks/notes.rake +28 -0
  43. data/tasks/post_load.rake +39 -0
  44. data/tasks/rdoc.rake +51 -0
  45. data/tasks/rubyforge.rake +58 -0
  46. data/tasks/setup.rb +268 -0
  47. data/tasks/spec.rake +55 -0
  48. data/tasks/svn.rake +48 -0
  49. data/tasks/test.rake +38 -0
  50. data/test/test_rmmseg.rb +0 -0
  51. metadata +116 -0
@@ -0,0 +1,222 @@
1
+ #include <cctype>
2
+ #include <cassert>
3
+
4
+ #include "rules.h"
5
+ #include "algor.h"
6
+
7
+ using namespace std;
8
+
9
+ namespace rmmseg
10
+ {
11
+ Token Algorithm::next_token()
12
+ {
13
+ do
14
+ {
15
+ if (m_pos >= m_text_length)
16
+ return Token(NULL, 0);
17
+
18
+ Token tk(NULL, 0);
19
+ int len = next_char();
20
+ if (len == 1)
21
+ tk = get_basic_latin_word();
22
+ else
23
+ tk = get_cjk_word(len);
24
+ if (tk.length > 0)
25
+ return tk;
26
+ }
27
+ while (true);
28
+ }
29
+
30
+ Token Algorithm::get_basic_latin_word()
31
+ {
32
+ int len = 1;
33
+ int start, end;
34
+
35
+ // Skip pre-word whitespaces and punctuations
36
+ while (m_pos < m_text_length)
37
+ {
38
+ if (len > 1)
39
+ break;
40
+ if (isalnum(m_text[m_pos]))
41
+ break;
42
+ m_pos++;
43
+ len = next_char();
44
+ }
45
+
46
+ start = m_pos;
47
+ while (m_pos < m_text_length)
48
+ {
49
+ if (len > 1)
50
+ break;
51
+ if (!isalnum(m_text[m_pos]))
52
+ break;
53
+ m_pos++;
54
+ len = next_char();
55
+ }
56
+ end = m_pos;
57
+
58
+ // Skip post-word whitespaces and punctuations
59
+ while (m_pos < m_text_length)
60
+ {
61
+ if (len > 1)
62
+ break;
63
+ if (isalnum(m_text[m_pos]))
64
+ break;
65
+ m_pos++;
66
+ len = next_char();
67
+ }
68
+
69
+ return Token(m_text+start, end-start);
70
+ }
71
+
72
+ Token Algorithm::get_cjk_word(int len)
73
+ {
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+ if (chunks.size() < 1)
86
+ return Token(NULL, 0);
87
+
88
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
89
+ m_pos += chunks[0].words[0]->nbytes;
90
+ return token;
91
+ }
92
+
93
    /**
     * Enumerate every candidate segmentation ("chunk") of up to three
     * successive words starting at m_pos. For each first-word choice,
     * each second-word choice, and each third-word choice returned by
     * find_match_words(), one Chunk is recorded. A fabricated tmp word
     * in third position (length == -1) is not stored: the chunk keeps
     * n == 2. m_pos is tentatively advanced while exploring and fully
     * restored before returning.
     */
    vector<Chunk> Algorithm::create_chunks()
    {
        vector<Chunk> chunks;
        Chunk chunk;
        Word *w1, *w2, *w3;

        int orig_pos = m_pos;   // restored on exit
        typedef vector<Word *> vec_t;
        typedef vec_t::iterator it_t;

        vec_t words1 = find_match_words();
        for (it_t i1 = words1.begin();
             i1 != words1.end();
             ++i1)
        {
            w1 = *i1;
            chunk.words[0] = w1;
            m_pos += w1->nbytes;        // tentatively consume word 1
            if (m_pos < m_text_length)
            {
                vec_t words2 = find_match_words();
                for (it_t i2 = words2.begin();
                     i2 != words2.end();
                     ++i2)
                {
                    w2 = *i2;
                    chunk.words[1] = w2;
                    m_pos += w2->nbytes;    // tentatively consume word 2
                    if (m_pos < m_text_length)
                    {
                        vec_t words3 = find_match_words();
                        for (it_t i3 = words3.begin();
                             i3 != words3.end();
                             ++i3)
                        {
                            w3 = *i3;
                            if (w3->length == -1) // tmp word
                            {
                                chunk.n = 2;
                            }
                            else
                            {
                                chunk.n = 3;
                                chunk.words[2] = w3;
                            }
                            chunks.push_back(chunk);
                        }
                    }
                    else if (m_pos == m_text_length)
                    {
                        // text ends exactly after two words
                        chunk.n = 2;
                        chunks.push_back(chunk);
                    }
                    m_pos -= w2->nbytes;    // backtrack word 2
                }
            }
            else if (m_pos == m_text_length)
            {
                // text ends exactly after one word
                chunk.n = 1;
                chunks.push_back(chunk);
            }
            m_pos -= w1->nbytes;            // backtrack word 1
        }

        m_pos = orig_pos;
        return chunks;
    }
160
+
161
+ int Algorithm::next_char()
162
+ {
163
+ // ONLY for UTF-8
164
+ unsigned char ch = m_text[m_pos];
165
+ if (ch >= 0xC0 && ch <= 0xDF)
166
+ {
167
+ return min(2, m_text_length-m_pos);
168
+ }
169
+ if (ch >= 0xE0 && ch <= 0xEF)
170
+ {
171
+ return min(3, m_text_length-m_pos);
172
+ }
173
+ return 1;
174
+ }
175
+
176
    /**
     * Collect all dictionary words starting at m_pos, up to
     * max_word_length() CJK characters. If nothing matches, a
     * temporary one-character word (length == -1) is fabricated so the
     * caller always receives at least one candidate. Results are
     * memoised in a small positional cache keyed by start offset.
     * m_pos is left unchanged.
     */
    vector<Word *> Algorithm::find_match_words()
    {
        // cache hit: reuse the word list computed for this offset
        for (int i = 0; i < match_cache_size; ++i)
            if (m_match_cache[i].first == m_pos)
                return m_match_cache[i].second;

        vector<Word *> words;
        Word *word;
        int orig_pos = m_pos;
        int n = 0, len;

        while (m_pos < m_text_length)
        {
            if (n >= max_word_length())
                break;
            len = next_char();
            if (len <= 1)
                break;          // a single-byte char ends the CJK run

            m_pos += len;
            n++;

            // try the prefix accumulated so far as a dictionary word
            word = dict::get(m_text+orig_pos, m_pos-orig_pos);
            if (word)
                words.push_back(word);
        }

        m_pos = orig_pos;   // this function must not consume input

        if (words.empty())
        {
            // no dictionary hit: fabricate a temporary single-char word
            word = get_tmp_word();
            word->nbytes = next_char();
            word->length = -1;
            strncpy(word->text, m_text+m_pos, word->nbytes);
            word->text[word->nbytes] = '\0';
            words.push_back(word);
        }

        // cache under the (restored) start offset; round-robin reuse
        m_match_cache[m_match_cache_i] = make_pair(m_pos, words);
        m_match_cache_i++;
        if (m_match_cache_i >= match_cache_size)
            m_match_cache_i = 0;

        return words;
    }
222
+ }
@@ -0,0 +1,80 @@
1
+ #ifndef _ALGORITHM_H_
2
+ #define _ALGORITHM_H_
3
+
4
+ #include <vector>
5
+
6
+ #include "chunk.h"
7
+ #include "token.h"
8
+ #include "dict.h"
9
+
10
+ /**
11
+ * The Algorithm of MMSeg use four rules:
12
+ * - Maximum matching rule
13
+ * - Largest average word length rule
14
+ * - Smallest variance of word length rule
15
+ * - Largest sum of degree of morphemic freedom of one-character
16
+ * words rule
17
+ */
18
+
19
+ namespace rmmseg
20
+ {
21
    /**
     * Tokenizer implementing the MMSeg algorithm over a UTF-8 buffer.
     * The buffer is borrowed, not copied: the caller must keep `text`
     * alive for the lifetime of this object.
     */
    class Algorithm
    {
    public:
        Algorithm(const char *text, int length)
            :m_text(text), m_pos(0),
             m_text_length(length),
             m_tmp_words_i(0),
             m_match_cache_i(0)
        {
            // -1 marks an empty cache slot (no valid text offset)
            for (int i = 0; i < match_cache_size; ++i)
                m_match_cache[i].first = -1;
        }

        // Next token; Token(NULL, 0) once the text is exhausted.
        Token next_token();

        const char *get_text() const
        {
            return m_text;
        }

    private:
        Token get_basic_latin_word();
        Token get_cjk_word(int);

        std::vector<Chunk> create_chunks();
        // NOTE(review): declared but no definition is visible in this
        // package — appears to be dead.
        int next_word();
        int next_char();
        std::vector<Word *> find_match_words();
        // Maximum CJK word length considered, in characters.
        int max_word_length() { return 4; }


        const char *m_text;     // borrowed input buffer (not owned)
        int m_pos;              // current byte offset into m_text
        int m_text_length;      // total byte length of m_text

        /* tmp words are only for 1-char words which
         * are not exist in the dictionary. It's length
         * value will be set to -1 to indicate it is
         * a tmp word. */
        Word *get_tmp_word()
        {
            if (m_tmp_words_i >= max_tmp_words)
                m_tmp_words_i = 0; // round wrap
            return &m_tmp_words[m_tmp_words_i++];
        }

        /* related to max_word_length and match_cache_size */
        static const int max_tmp_words = 64;
        Word m_tmp_words[max_tmp_words];
        int m_tmp_words_i;

        /* match word caches */
        static const int match_cache_size = 3;
        typedef std::pair<int, std::vector<Word *> > match_cache_t;
        match_cache_t m_match_cache[match_cache_size];
        int m_match_cache_i;
    };
78
+ }
79
+
80
+ #endif /* _ALGORITHM_H_ */
@@ -0,0 +1,59 @@
1
+ #ifndef _CHUNK_H_
2
+ #define _CHUNK_H_
3
+
4
+ #include <cmath>
5
+
6
+ #include "word.h"
7
+
8
+ namespace rmmseg
9
+ {
10
+ /**
11
+ * A chunk stores 3 (or less) successive words.
12
+ */
13
    /**
     * A chunk stores 3 (or less) successive words.
     */
    struct Chunk
    {
        // Total character (not byte) length of the chunk's words.
        // Temporary words carry length == -1, so abs() counts them as 1.
        int total_length() const
        {
            int len = 0;
            for (int i = 0; i < n; ++i)
                len += std::abs(words[i]->length);
            //if (words[i]->length == -1) /* tmp word */
            //    len += 1;
            //else
            //    len += words[i]->length;
            return len;
        }
        double average_length() const
        {
            return ((double)total_length())/n;
        }
        // NOTE(review): despite the name this is sqrt of the SUM of
        // squared deviations — it is neither divided by n nor squared
        // like a true variance. Kept as-is because the svwl rule only
        // compares these values across candidate chunks, matching the
        // original MMSeg implementation's behavior.
        double variance() const
        {
            double avg = average_length();
            double sqr_sum = 0;
            double tmp;
            for (int i = 0; i < n; ++i)
            {
                tmp = std::abs(words[i]->length);
                //if (tmp == -1)
                //    tmp = 1;
                tmp = tmp-avg;
                sqr_sum += tmp*tmp;
            }
            return std::sqrt(sqr_sum);
        }
        // Sum of the per-word frequencies (degree of morphemic freedom
        // of one-character words rule).
        int degree_of_morphemic_freedom() const
        {
            int sum = 0;
            for (int i = 0; i < n; ++i)
                sum += words[i]->freq;
            return sum;
        }


        int n;              // number of valid entries in words[] (1..3)
        Word *words[3];
    };
57
+ }
58
+
59
+ #endif /* _CHUNK_H_ */
@@ -0,0 +1,230 @@
1
+ #include <cstdio>
2
+
3
+ #include "dict.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace rmmseg
8
+ {
9
+ struct Entry
10
+ {
11
+ Word *word;
12
+ Entry *next;
13
+ };
14
+
15
+ const size_t init_size = 262147;
16
+ const size_t max_density = 5;
17
+ /*
18
+ Table of prime numbers 2^n+a, 2<=n<=30.
19
+ */
20
+ static size_t primes[] = {
21
+ 524288 + 21,
22
+ 1048576 + 7,
23
+ 2097152 + 17,
24
+ 4194304 + 15,
25
+ 8388608 + 9,
26
+ 16777216 + 43,
27
+ 33554432 + 35,
28
+ 67108864 + 15,
29
+ 134217728 + 29,
30
+ 268435456 + 3,
31
+ 536870912 + 11,
32
+ 1073741824 + 85,
33
+ };
34
+
35
+
36
+ static size_t n_bins = init_size;
37
+ static size_t n_entries = 0;
38
+ static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
+ sizeof(Entry *)));
40
+
41
+ static size_t new_size()
42
+ {
43
+ for (size_t i = 0;
44
+ i < sizeof(primes)/sizeof(primes[0]);
45
+ ++i)
46
+ {
47
+ if (primes[i] > n_bins)
48
+ {
49
+ return primes[i];
50
+ }
51
+ }
52
+ // TODO: raise exception here
53
+ return n_bins;
54
+ }
55
+
56
+ static unsigned int hash(const char *str, int len)
57
+ {
58
+ unsigned int key = 0;
59
+ while (len--)
60
+ {
61
+ key += *str++;
62
+ key += (key << 10);
63
+ key ^= (key >> 6);
64
+ }
65
+ key += (key << 3);
66
+ key ^= (key >> 11);
67
+ key += (key << 15);
68
+ return key;
69
+ }
70
+
71
    /**
     * Grow the table to the next prime size and redistribute every
     * entry. Entry nodes are relinked in place (no reallocation);
     * only the bucket array itself is replaced and freed.
     */
    static void rehash()
    {
        // NOTE(review): int vs size_t — fine while bucket counts stay
        // below 2^31, which the primes table guarantees.
        int new_n_bins = new_size();
        Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
                                                        sizeof(Entry *)));
        Entry *entry, *next;
        unsigned int hash_val;

        for (size_t i = 0; i < n_bins; ++i)
        {
            entry = bins[i];
            while (entry)
            {
                next = entry->next;
                hash_val = hash(entry->word->text,
                                entry->word->nbytes) % new_n_bins;
                // push-front into the new bucket
                entry->next = new_bins[hash_val];
                new_bins[hash_val] = entry;
                entry = next;
            }
        }
        free(bins);
        n_bins = new_n_bins;
        bins = new_bins;
    }
96
+
97
+ namespace dict
98
+ {
99
+
100
+ /**
101
+ * str: the base of the string
102
+ * len: length of the string (in bytes)
103
+ *
104
+ * str may be a substring of a big chunk of text thus not nul-terminated,
105
+ * so len is necessary here.
106
+ */
107
+ Word *get(const char *str, int len)
108
+ {
109
+ unsigned int h = hash(str, len) % n_bins;
110
+ Entry *entry = bins[h];
111
+ if (!entry)
112
+ return NULL;
113
+ do
114
+ {
115
+ if (len == entry->word->nbytes &&
116
+ strncmp(str, entry->word->text, len) == 0)
117
+ return entry->word;
118
+ entry = entry->next;
119
+ }
120
+ while (entry);
121
+
122
+ return NULL;
123
+ }
124
+
125
    /**
     * Insert `word` keyed by its text, overwriting any existing entry
     * with the same text. Entry nodes come from the never-freed pool
     * allocator (see memory.h).
     */
    void add(Word *word)
    {
        unsigned int hash_val = hash(word->text, word->nbytes);
        unsigned int h = hash_val % n_bins;
        Entry *entry = bins[h];
        if (!entry)
        {
            // NOTE(review): the load factor is only checked when the
            // target bucket is empty — collisions into occupied
            // buckets never trigger a rehash.
            if (n_entries/n_bins > max_density)
            {
                rehash();
                h = hash_val % n_bins;  // bucket count changed
            }

            entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
            entry->word = word;
            entry->next = NULL;
            bins[h] = entry;
            n_entries++;
            return;
        }

        bool done = false;
        do
        {
            if (word->nbytes == entry->word->nbytes &&
                strncmp(word->text, entry->word->text, word->nbytes) == 0)
            {
                /* Overwriting. WARNING: the original Word object is
                 * permanently lost. This IS a memory leak, because
                 * the memory is allocated by pool_alloc. Instead of
                 * fixing this, tuning the dictionary file is a better
                 * idea
                 */
                entry->word = word;
                done = true;
                break;
            }
            entry = entry->next;
        }
        while (entry);

        if (!done)
        {
            // not found in the chain: push-front a new entry
            entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
            entry->word = word;
            entry->next = bins[h];
            bins[h] = entry;
            n_entries++;
        }
    }
175
+
176
+ bool load_chars(const char *filename)
177
+ {
178
+ FILE *fp = fopen(filename, "r");
179
+ if (!fp)
180
+ {
181
+ return false;
182
+ }
183
+
184
+ const size_t buf_len = 24;
185
+ char buf[buf_len];
186
+ char *ptr;
187
+
188
+ while(fgets(buf, buf_len, fp))
189
+ {
190
+ // NOTE: there SHOULD be a newline at the end of the file
191
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
192
+ ptr = strchr(buf, ' ');
193
+ if (!ptr)
194
+ continue; // illegal input
195
+ *ptr = '\0';
196
+ add(make_word(ptr+1, 1, atoi(buf)));
197
+ }
198
+
199
+ fclose(fp);
200
+ return true;
201
+ }
202
+
203
+ bool load_words(const char *filename)
204
+ {
205
+ FILE *fp = fopen(filename, "r");
206
+ if (!fp)
207
+ {
208
+ return false;
209
+ }
210
+
211
+ const int buf_len = 48;
212
+ char buf[buf_len];
213
+ char *ptr;
214
+
215
+ while(fgets(buf, buf_len, fp))
216
+ {
217
+ // NOTE: there SHOULD be a newline at the end of the file
218
+ buf[strlen(buf)-1] = '\0'; // truncate the newline
219
+ ptr = strchr(buf, ' ');
220
+ if (!ptr)
221
+ continue; // illegal input
222
+ *ptr = '\0';
223
+ add(make_word(ptr+1, atoi(buf), 0));
224
+ }
225
+
226
+ fclose(fp);
227
+ return true;
228
+ }
229
+ }
230
+ }
data/ext/rmmseg/dict.h ADDED
@@ -0,0 +1,34 @@
1
#ifndef _DICT_H_
#define _DICT_H_

#include "word.h"

/**
 * A dictionary is a hash table of
 *  - key: string
 *  - value: word
 *
 * Dictionary data can be loaded from files. Two type of dictionary
 * files are supported:
 *  - character file: Each line contains a number and a character,
 *                    the number is the frequency of the character.
 *                    The frequency should NOT exceeds 65535.
 *  - word file: Each line contains a number and a word, the
 *               number is the character count of the word.
 */

namespace rmmseg
{
    /* Instead of making a class with only one instance, i'll not
     * bother to make it a class here. */

    namespace dict
    {
        // Insert (or overwrite) a word keyed by its text.
        void add(Word *word);
        // Load the character-frequency dictionary; false if unopenable.
        bool load_chars(const char *filename);
        // Load the multi-character word dictionary; false if unopenable.
        bool load_words(const char *filename);
        // Look up `len` bytes at `str`; NULL when absent.
        Word *get(const char *str, int len);
    }
}

#endif /* _DICT_H_ */
@@ -0,0 +1,17 @@
1
require 'mkmf'

# Link the C++ extension with g++ instead of the C compiler driver so
# the C++ runtime library is pulled in automatically.
CONFIG['LDSHARED'] = CONFIG['LDSHARED'].sub(/^\$\(CC\)/, 'g++')

# if RUBY_PLATFORM =~ /darwin/
#   # CONFIG['LDSHARED'] = 'g++ --dynamiclib -flat_namespace -undefined suppress'
#   CONFIG['LDSHARED'] = 'g++ --dynamiclib'
# elsif RUBY_PLATFORM =~ /linux/
#   CONFIG['LDSHARED'] = 'g++ -shared'
# end

# On macOS, Ruby symbols are resolved at load time, hence the
# flat-namespace / undefined-suppress flags.
if RUBY_PLATFORM =~ /darwin/
  CONFIG['LDSHARED'] = 'g++ -dynamiclib -single_module -flat_namespace -undefined suppress'
end

# Object files making up the extension.
$objs = ['algor.o', 'dict.o', 'memory.o', 'rmmseg.o']
create_makefile('rmmseg')
@@ -0,0 +1,9 @@
1
+ #include "memory.h"
2
+
3
+ #define PRE_ALLOC_SIZE 2097152 /* 2MB */
4
+
5
+ namespace rmmseg
6
+ {
7
+ char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
8
+ size_t _pool_size = PRE_ALLOC_SIZE;
9
+ }
@@ -0,0 +1,43 @@
1
+ #ifndef _MEMORY_H_
2
+ #define _MEMORY_H_
3
+
4
+ #include <cstdlib>
5
+
6
+ /**
7
+ * Pre-allocate a chunk of memory and allocate them in small pieces.
8
+ * Those memory are never freed after allocation. Used for persist
9
+ * data like dictionary contents that will never be destroyed unless
10
+ * the application exited.
11
+ */
12
+
13
+ namespace rmmseg
14
+ {
15
+ const size_t REALLOC_SIZE = 2048; /* 2KB */
16
+
17
+ extern size_t _pool_size;
18
+ extern char *_pool_base;
19
+
20
+ inline void *pool_alloc(size_t len)
21
+ {
22
+ void *mem = _pool_base;
23
+
24
+ if (len <= _pool_size)
25
+ {
26
+ _pool_size -= len;
27
+ _pool_base += len;
28
+ return mem;
29
+ }
30
+
31
+ /* NOTE: the remaining memory is simply discard, which WILL
32
+ * cause memory leak. However, this function is not for allocating
33
+ * large object. Larger pre-alloc chunk size will also reduce the
34
+ * impact of this leak. So this is generally not a problem. */
35
+ _pool_base = static_cast<char *>(std::malloc(REALLOC_SIZE));
36
+ mem = _pool_base;
37
+ _pool_base += len;
38
+ _pool_size = REALLOC_SIZE - len;
39
+ return mem;
40
+ }
41
+ }
42
+
43
+ #endif /* _MEMORY_H_ */