lijia-rmmseg-cpp 10.2.9.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
1
+ #ifndef _DICT_H_
2
+ #define _DICT_H_
3
+
4
+ #include "word.h"
5
+
6
+ /**
7
+ * A dictionary is a hash table of
8
+ * - key: string
9
+ * - value: word
10
+ *
11
+ * Dictionary data can be loaded from files. Two types of dictionary
12
+ * files are supported:
13
+ * - character file: Each line contains a number and a character,
14
+ * the number is the frequency of the character.
15
+ * The frequency should NOT exceed 65535.
16
+ * - word file: Each line contains a number and a word, the
17
+ * number is the character count of the word.
18
+ */
19
+
20
+ namespace rmmseg
21
+ {
22
+ /* Instead of making a class with only one instance, i'll not
23
+ * bother to make it a class here. */
24
+
25
+ namespace dict
26
+ {
27
+ void add(Word *word);
28
+ bool load_chars(const char *filename);
29
+ bool load_words(const char *filename);
30
+ Word *get(const char *str, int len);
31
+ }
32
+ }
33
+
34
+ #endif /* _DICT_H_ */
@@ -0,0 +1,17 @@
1
# Generates the Makefile for the rmmseg native extension.
require 'mkmf'

# The extension is written in C++, so the shared object has to be linked
# by g++ rather than by the $(CC) driver mkmf puts at the front of LDSHARED.
CONFIG['LDSHARED'] = CONFIG['LDSHARED'].sub(/^\$\(CC\)/, 'g++')

# Earlier per-platform overrides, kept for reference:
#
# if RUBY_PLATFORM =~ /darwin/
#   # CONFIG['LDSHARED'] = 'g++ --dynamiclib -flat_namespace -undefined suppress'
#   CONFIG['LDSHARED'] = 'g++ --dynamiclib'
# elsif RUBY_PLATFORM =~ /linux/
#   CONFIG['LDSHARED'] = 'g++ -shared'
# end

# On darwin, when `which brew` prints nothing, replace the link command
# entirely. The shell command is only run on darwin (short-circuit).
on_darwin = (RUBY_PLATFORM =~ /darwin/)
if on_darwin && `which brew`.empty?
  CONFIG['LDSHARED'] = 'g++ -dynamiclib -single_module -flat_namespace -undefined suppress'
end

# Object files that make up the extension.
$objs = %w[algor.o dict.o memory.o rmmseg.o]
create_makefile('rmmseg')
@@ -0,0 +1,9 @@
1
+ #include "memory.h"
2
+
3
#define PRE_ALLOC_SIZE 2097152 /* 2MB */

namespace rmmseg
{
    /* Start of the unused part of the current pool chunk. The first
     * chunk is PRE_ALLOC_SIZE bytes and is obtained during static
     * initialization. */
    char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));

    /* Number of bytes still available at _pool_base. If the initial
     * malloc failed this is 0 rather than PRE_ALLOC_SIZE, so the pool
     * never claims to own memory behind a null base pointer. */
    size_t _pool_size = (_pool_base != NULL) ? PRE_ALLOC_SIZE : 0;
}
@@ -0,0 +1,43 @@
1
+ #ifndef _MEMORY_H_
2
+ #define _MEMORY_H_
3
+
4
+ #include <cstdlib>
5
+
6
+ /**
7
+ * Pre-allocate a chunk of memory and hand it out in small pieces.
8
+ * This memory is never freed after allocation. It is used for persistent
9
+ * data, such as dictionary contents, that is never destroyed until
10
+ * the application exits.
11
+ */
12
+
13
namespace rmmseg
{
    /* Size of every chunk requested after the initial one is used up. */
    const size_t REALLOC_SIZE = 2048; /* 2KB */

    /* Pool state, defined in another translation unit: the number of
     * unused bytes in the current chunk and a pointer to the first of
     * them. */
    extern size_t _pool_size;
    extern char *_pool_base;

    /**
     * Hand out `len` bytes that are never freed.
     *
     * The request is carved off the current chunk when it fits.
     * Otherwise a fresh chunk of REALLOC_SIZE bytes is started, or, for
     * requests larger than REALLOC_SIZE, a dedicated allocation of
     * exactly `len` bytes is made.
     *
     * Returns NULL when the underlying malloc fails; the pool state is
     * left untouched in that case. No alignment beyond byte granularity
     * is applied to pointers carved from a chunk.
     */
    inline void *pool_alloc(size_t len)
    {
        if (len <= _pool_size)
        {
            void *mem = _pool_base;
            _pool_size -= len;
            _pool_base += len;
            return mem;
        }

        /* A request that does not fit in a refill chunk gets its own
         * allocation. Serving it from a REALLOC_SIZE chunk would overrun
         * that chunk and make the unsigned `REALLOC_SIZE - len` wrap
         * around. The current chunk is kept for later small requests. */
        if (len > REALLOC_SIZE)
            return std::malloc(len);

        /* NOTE: the remainder of the old chunk is simply discarded, which
         * WILL leak it. This function is not meant for large objects and
         * a larger pre-allocated chunk reduces the impact, so this is
         * generally not a problem. */
        char *chunk = static_cast<char *>(std::malloc(REALLOC_SIZE));
        if (chunk == NULL)
            return NULL;

        _pool_base = chunk + len;
        _pool_size = REALLOC_SIZE - len;
        return chunk;
    }
}
42
+
43
+ #endif /* _MEMORY_H_ */
@@ -0,0 +1,263 @@
1
+ #include <ruby.h>
2
+ #include <cstdio> // for debug
3
+
4
+ #include "token.h"
5
+ #include "dict.h"
6
+ #include "algor.h"
7
+
8
+ using namespace std;
9
+
10
+ extern "C" {
11
+
12
+ /*****************************************
13
+ *
14
+ * Normal interface
15
+ *
16
+ *****************************************/
17
+
18
+ /*********************
19
+ * RMMSeg module
20
+ *********************/
21
+ static VALUE mRMMSeg;
22
+
23
+
24
+ /*********************
25
+ * Dictionary module
26
+ *********************/
27
+ static VALUE mDictionary;
28
+
29
+ /*
30
+ * Load a character dictionary.
31
+ *
32
+ * call-seq:
33
+ * load_chars(path) -> status
34
+ *
35
+ * Return +true+ if loaded successfully, +false+ otherwise.
36
+ */
37
+ static VALUE dic_load_chars(VALUE mod, VALUE path)
38
+ {
39
+ if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
40
+ return Qtrue;
41
+ return Qfalse;
42
+ }
43
+
44
+ /*
45
+ * Load a word dictionary.
46
+ *
47
+ * call-seq:
48
+ * load_words(path) -> status
49
+ *
50
+ * Return +true+ if loaded successfully, +false+ otherwise.
51
+ */
52
+ static VALUE dic_load_words(VALUE mod, VALUE path)
53
+ {
54
+ if (rmmseg::dict::load_words(RSTRING_PTR(path)))
55
+ return Qtrue;
56
+ return Qfalse;
57
+ }
58
+
59
+ /*
60
+ * Add a word to the in-memory dictionary.
61
+ *
62
+ * call-seq:
63
+ * add(word, length, freq)
64
+ *
65
+ * - +word+ is a String.
66
+ * - +length+ is number of characters (not number of bytes) of the
67
+ * word to be added.
68
+ * - +freq+ is the frequency of the word. This is only used when
69
+ * it is a one-character word.
70
+ */
71
+ static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
72
+ {
73
+ const char *str = RSTRING_PTR(word);
74
+ int nbytes = RSTRING_LEN(word);
75
+ rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
76
+ rmmseg::dict::add(w);
77
+ return Qnil;
78
+ }
79
+
80
+ /*
81
+ * Check whether one word is included in the dictionary.
82
+ *
83
+ * call-seq:
84
+ * has_word?(word) -> result
85
+ *
86
+ * Return +true+ if the word is included in the dictionary,
87
+ * +false+ otherwise.
88
+ */
89
+ static VALUE dic_has_word(VALUE mod, VALUE word)
90
+ {
91
+ const char *str = RSTRING_PTR(word);
92
+ int nbytes = RSTRING_LEN(word);
93
+ if (rmmseg::dict::get(str, nbytes) != NULL)
94
+ return Qtrue;
95
+ return Qfalse;
96
+ }
97
+
98
+
99
+ /**********************
100
+ * Token Class
101
+ **********************/
102
+ struct Token
103
+ {
104
+ VALUE text;
105
+ VALUE start;
106
+ VALUE end;
107
+ };
108
+
109
+ static void tk_mark(Token *t)
110
+ {
111
+ // start and end are Fixnums, no need to mark
112
+ rb_gc_mark(t->text);
113
+ }
114
+ static void tk_free(Token *t)
115
+ {
116
+ free(t);
117
+ }
118
+
119
+ /*
120
+ * Get the text held by this token.
121
+ *
122
+ * call-seq:
123
+ * text() -> text
124
+ *
125
+ */
126
+ static VALUE tk_text(VALUE self)
127
+ {
128
+ Token *tk = (Token *)DATA_PTR(self);
129
+ return tk->text;
130
+ }
131
+
132
+ /*
133
+ * Get the start position of this token.
134
+ *
135
+ * call-seq:
136
+ * start() -> start_pos
137
+ *
138
+ */
139
+ static VALUE tk_start(VALUE self)
140
+ {
141
+ Token *tk = (Token *)DATA_PTR(self);
142
+ return tk->start;
143
+ }
144
+
145
+ /*
146
+ * Get the end position of this token.
147
+ *
148
+ * call-seq:
149
+ * end() -> end_pos
150
+ *
151
+ */
152
+ static VALUE tk_end(VALUE self)
153
+ {
154
+ Token *tk = (Token *)DATA_PTR(self);
155
+ return tk->end;
156
+ }
157
+
158
+ static VALUE cToken;
159
+ static VALUE tk_create(const char* base, const rmmseg::Token &t)
160
+ {
161
+ Token *tk = ALLOC(Token);
162
+ int start = t.text-base;
163
+
164
+ // This is necessary, see
165
+ // http://lifegoo.pluskid.org/?p=348
166
+ volatile VALUE text = rb_str_new(t.text, t.length);
167
+ tk->text = text;
168
+
169
+ tk->start = INT2FIX(start);
170
+ tk->end = INT2FIX(start + t.length);
171
+ volatile VALUE tok = Data_Wrap_Struct(cToken,
172
+ (RUBY_DATA_FUNC)tk_mark,
173
+ (RUBY_DATA_FUNC)tk_free,
174
+ tk);
175
+ return tok;
176
+ }
177
+
178
+ /*********************
179
+ * Algorithm Class
180
+ *********************/
181
+ struct Algorithm
182
+ {
183
+ VALUE text; // hold to avoid being garbage collected
184
+ rmmseg::Algorithm *algor;
185
+ };
186
+
187
+ static void algor_mark(Algorithm *a)
188
+ {
189
+ rb_gc_mark(a->text);
190
+ }
191
+ static void algor_free(Algorithm *a)
192
+ {
193
+ free(a->algor);
194
+ }
195
+
196
+ static VALUE cAlgorithm;
197
+
198
+ /*
199
+ * Create an Algorithm object to do segmenting on +text+.
200
+ *
201
+ * call-seq:
202
+ * new(text) -> algorithm
203
+ *
204
+ */
205
+ static VALUE algor_create(VALUE klass, VALUE text)
206
+ {
207
+ Algorithm *algor = ALLOC(Algorithm);
208
+ void *mem;
209
+ algor->text = text;
210
+ mem = malloc(sizeof(rmmseg::Algorithm));
211
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
212
+ RSTRING_LEN(text));
213
+
214
+ return Data_Wrap_Struct(klass,
215
+ (RUBY_DATA_FUNC)algor_mark,
216
+ (RUBY_DATA_FUNC)algor_free,
217
+ algor);
218
+ }
219
+
220
+ /*
221
+ * Get next token.
222
+ *
223
+ * call-seq:
224
+ * next_token() -> token
225
+ *
226
+ * Return +nil+ if no more token available.
227
+ */
228
+ static VALUE algor_next_token(VALUE self)
229
+ {
230
+ Algorithm *algor = (Algorithm *)DATA_PTR(self);
231
+ rmmseg::Token tk = algor->algor->next_token();
232
+
233
+ if (tk.length == 0)
234
+ return Qnil;
235
+ volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
236
+ return rtk;
237
+ }
238
+
239
+
240
+ void Init_rmmseg()
241
+ {
242
+ mRMMSeg = rb_define_module("RMMSeg");
243
+
244
+ /* Manage dictionaries used by rmmseg. */
245
+ mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
246
+ rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
247
+ rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
248
+ rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
249
+ rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
250
+
251
+ /* A Token hold the text and related position information. */
252
+ cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
253
+ rb_undef_method(rb_singleton_class(cToken), "new");
254
+ rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
255
+ rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
256
+ rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
257
+
258
+ /* An Algorithm object use the MMSEG algorithm to do segmenting. */
259
+ cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
260
+ rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
261
+ rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
262
+ }
263
+ }
@@ -0,0 +1,86 @@
1
+ #ifndef _RULES_H_
2
+ #define _RULES_H_
3
+
4
+ #include <vector>
5
+ #include <algorithm>
6
+
7
+ #include "chunk.h"
8
+
9
+ namespace rmmseg
10
+ {
11
/**
 * Reduce `chunks` to the elements that rank highest under `cmp`.
 *
 * `cmp(a, b)` returns a positive value when `a` ranks above `b`, zero
 * when they tie and a negative value otherwise; only the sign is used.
 * All elements tied for the top rank are kept, everything else is
 * erased. An empty vector is left untouched.
 *
 * The element type is deduced, so existing calls that pass a
 * std::vector<Chunk> keep working unchanged.
 */
template <typename Cmp, typename T>
void take_highest(std::vector<T> &chunks, const Cmp &cmp)
{
    // Without this guard the final erase would start at begin()+1 of an
    // empty vector, which is undefined behaviour.
    if (chunks.empty())
        return;

    typedef typename std::vector<T>::size_type size_type;

    // chunks[0, keep) always holds the best elements seen so far, with
    // chunks[0] as the reference every candidate is compared against.
    size_type keep = 1;
    for (size_type j = 1; j < chunks.size(); ++j)
    {
        const int rlt = cmp(chunks[j], chunks[0]);
        if (rlt > 0)
            keep = 0;   // strictly better: restart the kept prefix
        if (rlt >= 0)
            std::swap(chunks[keep++], chunks[j]);
    }
    chunks.erase(chunks.begin() + keep, chunks.end());
}
26
+
27
+ struct MMCmp_t
28
+ {
29
+ int operator()(const Chunk &a, const Chunk &b) const
30
+ {
31
+ return a.total_length() - b.total_length();
32
+ }
33
+ } MMCmp;
34
+ void mm_filter(std::vector<Chunk> &chunks)
35
+ {
36
+ take_highest(chunks, MMCmp);
37
+ }
38
+
39
+ struct LAWLCmp_t
40
+ {
41
+ int operator()(const Chunk &a, const Chunk &b) const
42
+ {
43
+ double rlt = a.average_length() - b.average_length();
44
+ if (rlt == 0)
45
+ return 0;
46
+ if (rlt > 0)
47
+ return 1;
48
+ return -1;
49
+ }
50
+ } LAWLCmp;
51
+ void lawl_filter(std::vector<Chunk> &chunks)
52
+ {
53
+ take_highest(chunks, LAWLCmp);
54
+ }
55
+
56
+ struct SVWLCmp_t
57
+ {
58
+ int operator()(const Chunk &a, const Chunk& b) const
59
+ {
60
+ double rlt = a.variance() - b.variance();
61
+ if (rlt == 0)
62
+ return 0;
63
+ if (rlt < 0)
64
+ return 1;
65
+ return -1;
66
+ }
67
+ } SVWLCmp;
68
+ void svwl_filter(std::vector<Chunk> &chunks)
69
+ {
70
+ take_highest(chunks, SVWLCmp);
71
+ }
72
+
73
+ struct LSDMFOCWCmp_t
74
+ {
75
+ int operator()(const Chunk &a, const Chunk& b) const
76
+ {
77
+ return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
78
+ }
79
+ } LSDMFOCWCmp;
80
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
81
+ {
82
+ take_highest(chunks, LSDMFOCWCmp);
83
+ }
84
+ }
85
+
86
+ #endif /* _RULES_H_ */