rmmseg-cpp 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
data/ext/rmmseg/dict.h ADDED
@@ -0,0 +1,34 @@
1
+ #ifndef _DICT_H_
2
+ #define _DICT_H_
3
+
4
+ #include "word.h"
5
+
6
+ /**
7
+ * A dictionary is a hash table of
8
+ * - key: string
9
+ * - value: word
10
+ *
11
+ * Dictionary data can be loaded from files. Two types of dictionary
12
+ * files are supported:
13
+ * - character file: Each line contains a number and a character,
14
+ * the number is the frequency of the character.
15
+ * The frequency should NOT exceed 65535.
16
+ * - word file: Each line contains a number and a word, the
17
+ * number is the character count of the word.
18
+ */
19
+
20
+ namespace rmmseg
21
+ {
22
+ /* Instead of making a class with only one instance, i'll not
23
+ * bother to make it a class here. */
24
+
25
+ namespace dict
26
+ {
27
+ void add(Word *word);
28
+ bool load_chars(const char *filename);
29
+ bool load_words(const char *filename);
30
+ Word *get(const char *str, int len);
31
+ }
32
+ }
33
+
34
+ #endif /* _DICT_H_ */
@@ -0,0 +1,10 @@
1
require 'mkmf'

# Use g++ as the shared-library linker; Darwin needs its own flag set.
CONFIG['LDSHARED'] =
  if RUBY_PLATFORM =~ /darwin/
    'g++ --dynamiclib -flat_namespace -undefined suppress'
  else
    'g++ -shared'
  end

# Object files that are linked into the extension.
$objs = %w[algor.o dict.o memory.o rmmseg.o]
create_makefile('rmmseg')
@@ -0,0 +1,9 @@
1
+ #include "memory.h"
2
+
3
#define PRE_ALLOC_SIZE 2097152 /* 2MB */

namespace rmmseg
{
    /* Number of bytes still unused in the current pool chunk. */
    int _pool_size = PRE_ALLOC_SIZE;

    /* First unused byte of the current pool chunk. The initial chunk
     * is obtained once, when this translation unit is initialized. */
    char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
}
@@ -0,0 +1,43 @@
1
+ #ifndef _MEMORY_H_
2
+ #define _MEMORY_H_
3
+
4
+ #include <cstdlib>
5
+
6
+ /**
7
+ * Pre-allocate a chunk of memory and hand it out in small pieces.
8
+ * This memory is never freed after allocation. Used for persistent
9
+ * data like dictionary contents that will never be destroyed until
10
+ * the application exits.
11
+ */
12
+
13
namespace rmmseg
{
    const int REALLOC_SIZE = 2048; /* 2KB */

    extern int _pool_size;
    extern char *_pool_base;

    /**
     * Hand out len bytes from the pre-allocated pool.
     *
     * The bytes are carved off the front of the current chunk. When the
     * chunk cannot satisfy the request a new chunk is obtained with
     * malloc; it is at least REALLOC_SIZE bytes and always large enough
     * for the request itself.
     *
     * Returns a pointer to the bytes, or NULL when len is negative or
     * when malloc fails. On failure the pool state is left untouched.
     * Memory returned by this function is never freed.
     */
    inline void *pool_alloc(int len)
    {
        /* A negative length would move the pool cursor backwards. */
        if (len < 0)
            return NULL;

        /* Fast path: the current chunk still has room. The NULL check
         * covers a failed allocation of the current chunk. */
        if (_pool_base != NULL && len <= _pool_size)
        {
            void *mem = _pool_base;
            _pool_size -= len;
            _pool_base += len;
            return mem;
        }

        /* NOTE: whatever is left of the old chunk is simply discarded,
         * which WILL leak that memory. This allocator is not meant for
         * large objects and a larger pre-allocated chunk reduces the
         * impact, so this is generally not a problem.
         *
         * The new chunk must never be smaller than the request,
         * otherwise the caller would write past the end of it. */
        const int chunk_size = (len > REALLOC_SIZE) ? len : REALLOC_SIZE;
        char *chunk = static_cast<char *>(std::malloc(chunk_size));
        if (chunk == NULL)
            return NULL;

        _pool_base = chunk + len;
        _pool_size = chunk_size - len;
        return chunk;
    }
}
42
+
43
+ #endif /* _MEMORY_H_ */
@@ -0,0 +1,261 @@
1
+ #include <ruby.h>
2
+ #include <cstdio> // for debug
3
+
4
+ #include "token.h"
5
+ #include "dict.h"
6
+ #include "algor.h"
7
+
8
+ using namespace std;
9
+
10
+ extern "C" {
11
+
12
+ /*****************************************
13
+ *
14
+ * Normal interface
15
+ *
16
+ *****************************************/
17
+
18
+ /*********************
19
+ * RMMSeg module
20
+ *********************/
21
+ static VALUE mRMMSeg;
22
+
23
+
24
+ /*********************
25
+ * Dictionary module
26
+ *********************/
27
+ static VALUE mDictionary;
28
+
29
+ /*
30
+ * Load a character dictionary.
31
+ *
32
+ * call-seq:
33
+ * load_chars(path) -> status
34
+ *
35
+ * Return +true+ if loaded successfully, +false+ otherwise.
36
+ */
37
+ static VALUE dic_load_chars(VALUE mod, VALUE path)
38
+ {
39
+ if (rmmseg::dict::load_chars(RSTRING(path)->ptr))
40
+ return Qtrue;
41
+ return Qfalse;
42
+ }
43
+
44
+ /*
45
+ * Load a word dictionary.
46
+ *
47
+ * call-seq:
48
+ * load_words(path) -> status
49
+ *
50
+ * Return +true+ if loaded successfully, +false+ otherwise.
51
+ */
52
+ static VALUE dic_load_words(VALUE mod, VALUE path)
53
+ {
54
+ if (rmmseg::dict::load_words(RSTRING(path)->ptr))
55
+ return Qtrue;
56
+ return Qfalse;
57
+ }
58
+
59
+ /*
60
+ * Add a word to the in-memory dictionary.
61
+ *
62
+ * call-seq:
63
+ * add(word, length, freq)
64
+ *
65
+ * - +word+ is a String.
66
+ * - +length+ is number of characters (not number of bytes) of the
67
+ * word to be added.
68
+ * - +freq+ is the frequency of the word. This is only used when
69
+ * it is a one-character word.
70
+ */
71
+ static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
72
+ {
73
+ const char *str = RSTRING(word)->ptr;
74
+ int nbytes = RSTRING(word)->len;
75
+ rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
76
+ rmmseg::dict::add(w);
77
+ return Qnil;
78
+ }
79
+
80
+ /*
81
+ * Check whether one word is included in the dictionary.
82
+ *
83
+ * call-seq:
84
+ * has_word?(word) -> result
85
+ *
86
+ * Return +true+ if the word is included in the dictionary,
87
+ * +false+ otherwise.
88
+ */
89
+ static VALUE dic_has_word(VALUE mod, VALUE word)
90
+ {
91
+ const char *str = RSTRING(word)->ptr;
92
+ int nbytes = RSTRING(word)->len;
93
+ if (rmmseg::dict::get(str, nbytes) != NULL)
94
+ return Qtrue;
95
+ return Qfalse;
96
+ }
97
+
98
+
99
+ /**********************
100
+ * Token Class
101
+ **********************/
102
+ struct Token
103
+ {
104
+ VALUE text;
105
+ VALUE start;
106
+ VALUE end;
107
+ };
108
+
109
+ static void tk_mark(Token *t)
110
+ {
111
+ // start and end are Fixnums, no need to mark
112
+ rb_gc_mark(t->text);
113
+ }
114
+ static void tk_free(Token *t)
115
+ {
116
+ free(t);
117
+ }
118
+
119
+ /*
120
+ * Get the text held by this token.
121
+ *
122
+ * call-seq:
123
+ * text() -> text
124
+ *
125
+ */
126
+ static VALUE tk_text(VALUE self)
127
+ {
128
+ Token *tk = (Token *)DATA_PTR(self);
129
+ return tk->text;
130
+ }
131
+
132
+ /*
133
+ * Get the start position of this token.
134
+ *
135
+ * call-seq:
136
+ * start() -> start_pos
137
+ *
138
+ */
139
+ static VALUE tk_start(VALUE self)
140
+ {
141
+ Token *tk = (Token *)DATA_PTR(self);
142
+ return tk->start;
143
+ }
144
+
145
+ /*
146
+ * Get the end position of this token.
147
+ *
148
+ * call-seq:
149
+ * end() -> end_pos
150
+ *
151
+ */
152
+ static VALUE tk_end(VALUE self)
153
+ {
154
+ Token *tk = (Token *)DATA_PTR(self);
155
+ return tk->end;
156
+ }
157
+
158
+ static VALUE cToken;
159
+ static VALUE tk_create(const char* base, const rmmseg::Token &t)
160
+ {
161
+ Token *tk = ALLOC(Token);
162
+ int start = t.text-base;
163
+
164
+ // This is necessary, see
165
+ // http://pluskid.lifegoo.com/?p=348
166
+ volatile VALUE text = rb_str_new(t.text, t.length);
167
+ tk->text = text;
168
+
169
+ tk->start = INT2FIX(start);
170
+ tk->end = INT2FIX(start + t.length);
171
+ return Data_Wrap_Struct(cToken,
172
+ (RUBY_DATA_FUNC)tk_mark,
173
+ (RUBY_DATA_FUNC)tk_free,
174
+ tk);
175
+ }
176
+
177
+ /*********************
178
+ * Algorithm Class
179
+ *********************/
180
+ struct Algorithm
181
+ {
182
+ VALUE text; // hold to avoid being garbage collected
183
+ rmmseg::Algorithm *algor;
184
+ };
185
+
186
+ static void algor_mark(Algorithm *a)
187
+ {
188
+ rb_gc_mark(a->text);
189
+ }
190
+ static void algor_free(Algorithm *a)
191
+ {
192
+ free(a->algor);
193
+ }
194
+
195
+ static VALUE cAlgorithm;
196
+
197
+ /*
198
+ * Create an Algorithm object to do segmenting on +text+.
199
+ *
200
+ * call-seq:
201
+ * new(text) -> algorithm
202
+ *
203
+ */
204
+ static VALUE algor_create(VALUE klass, VALUE text)
205
+ {
206
+ Algorithm *algor = ALLOC(Algorithm);
207
+ void *mem;
208
+ algor->text = text;
209
+ mem = malloc(sizeof(rmmseg::Algorithm));
210
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING(text)->ptr,
211
+ RSTRING(text)->len);
212
+
213
+ return Data_Wrap_Struct(klass,
214
+ (RUBY_DATA_FUNC)algor_mark,
215
+ (RUBY_DATA_FUNC)algor_free,
216
+ algor);
217
+ }
218
+
219
+ /*
220
+ * Get next token.
221
+ *
222
+ * call-seq:
223
+ * next_token() -> token
224
+ *
225
+ * Return +nil+ if no more token available.
226
+ */
227
+ static VALUE algor_next_token(VALUE self)
228
+ {
229
+ Algorithm *algor = (Algorithm *)DATA_PTR(self);
230
+ rmmseg::Token tk = algor->algor->next_token();
231
+
232
+ if (tk.length == 0)
233
+ return Qnil;
234
+ return tk_create(RSTRING(algor->text)->ptr, tk);
235
+ }
236
+
237
+
238
+ void Init_rmmseg()
239
+ {
240
+ mRMMSeg = rb_define_module("RMMSeg");
241
+
242
+ /* Manage dictionaries used by rmmseg. */
243
+ mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
244
+ rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
245
+ rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
246
+ rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
247
+ rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
248
+
249
+ /* A Token hold the text and related position information. */
250
+ cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
251
+ rb_undef_method(rb_singleton_class(cToken), "new");
252
+ rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
253
+ rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
254
+ rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
255
+
256
+ /* An Algorithm object use the MMSEG algorithm to do segmenting. */
257
+ cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
258
+ rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
259
+ rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
260
+ }
261
+ }
@@ -0,0 +1,87 @@
1
+ #ifndef _RULES_H_
2
+ #define _RULES_H_
3
+
4
+ #include <vector>
5
+ #include <algorithm>
6
+
7
+ #include "chunk.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ template <typename Cmp>
12
+ void take_highest(std::vector<Chunk> &chunks, Cmp &cmp)
13
+ {
14
+ int i = 1, j;
15
+ Chunk& max = chunks[0];
16
+
17
+ for (j = 1; j < chunks.size(); ++j)
18
+ {
19
+ int rlt = cmp(chunks[j], max);
20
+ if (rlt > 0)
21
+ i = 0;
22
+ if (rlt >= 0)
23
+ std::swap(chunks[i++], chunks[j]);
24
+ }
25
+ chunks.erase(chunks.begin()+i, chunks.end());
26
+ }
27
+
28
+ struct MMCmp_t
29
+ {
30
+ int operator()(Chunk &a, Chunk &b)
31
+ {
32
+ return a.total_length() - b.total_length();
33
+ }
34
+ } MMCmp;
35
+ void mm_filter(std::vector<Chunk> &chunks)
36
+ {
37
+ take_highest(chunks, MMCmp);
38
+ }
39
+
40
+ struct LAWLCmp_t
41
+ {
42
+ int operator()(Chunk &a, Chunk &b)
43
+ {
44
+ double rlt = a.average_length() - b.average_length();
45
+ if (rlt == 0)
46
+ return 0;
47
+ if (rlt > 0)
48
+ return 1;
49
+ return -1;
50
+ }
51
+ } LAWLCmp;
52
+ void lawl_filter(std::vector<Chunk> &chunks)
53
+ {
54
+ take_highest(chunks, LAWLCmp);
55
+ }
56
+
57
+ struct SVWLCmp_t
58
+ {
59
+ int operator()(Chunk &a, Chunk& b)
60
+ {
61
+ double rlt = a.variance() - b.variance();
62
+ if (rlt == 0)
63
+ return 0;
64
+ if (rlt < 0)
65
+ return 1;
66
+ return -1;
67
+ }
68
+ } SVWLCmp;
69
+ void svwl_filter(std::vector<Chunk> &chunks)
70
+ {
71
+ take_highest(chunks, SVWLCmp);
72
+ }
73
+
74
+ struct LSDMFOCWCmp_t
75
+ {
76
+ int operator()(Chunk &a, Chunk& b)
77
+ {
78
+ return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
79
+ }
80
+ } LSDMFOCWCmp;
81
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
82
+ {
83
+ take_highest(chunks, LSDMFOCWCmp);
84
+ }
85
+ }
86
+
87
+ #endif /* _RULES_H_ */