rmmseg-cpp-new 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +3 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +253 -0
- data/ext/rmmseg/algor.h +79 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +13 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg-cpp-new.rb +2 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- metadata +68 -0
data/ext/rmmseg/dict.h
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#ifndef _DICT_H_
|
2
|
+
#define _DICT_H_
|
3
|
+
|
4
|
+
#include "word.h"
|
5
|
+
|
6
|
+
/**
|
7
|
+
* A dictionary is a hash table of
|
8
|
+
* - key: string
|
9
|
+
* - value: word
|
10
|
+
*
|
11
|
+
* Dictionary data can be loaded from files. Two types of dictionary
|
12
|
+
* files are supported:
|
13
|
+
* - character file: Each line contains a number and a character,
|
14
|
+
* the number is the frequency of the character.
|
15
|
+
* The frequency should NOT exceed 65535.
|
16
|
+
* - word file: Each line contains a number and a word, the
|
17
|
+
* number is the character count of the word.
|
18
|
+
*/
|
19
|
+
|
20
|
+
namespace rmmseg
|
21
|
+
{
|
22
|
+
/* Instead of making a class with only one instance, i'll not
|
23
|
+
* bother to make it a class here. */
|
24
|
+
|
25
|
+
namespace dict
|
26
|
+
{
|
27
|
+
void add(Word *word);
|
28
|
+
bool load_chars(const char *filename);
|
29
|
+
bool load_words(const char *filename);
|
30
|
+
Word *get(const char *str, int len);
|
31
|
+
}
|
32
|
+
}
|
33
|
+
|
34
|
+
#endif /* _DICT_H_ */
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Run mkmf's try_link on each extension source name, in the original order.
# NOTE(review): mkmf documents try_link's first argument as C source text
# rather than a file name, and the results are not used here -- confirm
# whether these probes are actually needed.
%w[algor.cpp memory.cpp dict.cpp rmmseg.cpp].each do |source|
  try_link(source)
end

# When RUBY_PLATFORM matches mswin32, mingw32 or bccwin32, link the shared
# object with libgcc and libstdc++ statically.
if [/mswin32/, /mingw32/, /bccwin32/].any? { |pattern| pattern === RUBY_PLATFORM }
  CONFIG['LDSHAREDXX'] = "$(CXX) -shared -static-libgcc -static-libstdc++"
end

create_makefile('rmmseg')
|
data/ext/rmmseg/memory.h
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#ifndef _MEMORY_H_
|
2
|
+
#define _MEMORY_H_
|
3
|
+
|
4
|
+
#include <cstdlib>
|
5
|
+
|
6
|
+
/**
|
7
|
+
* Pre-allocate a chunk of memory and allocate them in small pieces.
|
8
|
+
* That memory is never freed after allocation. Used for persistent
|
9
|
+
* data like dictionary contents that will never be destroyed unless
|
10
|
+
* the application exits.
|
11
|
+
*/
|
12
|
+
|
13
|
+
namespace rmmseg
{
    /* Size of each chunk requested from malloc to refill the pool. */
    const size_t REALLOC_SIZE = 2048; /* 2KB */

    /* Number of bytes still available in the current chunk. */
    extern size_t _pool_size;
    /* Address of the next free byte in the current chunk. */
    extern char *_pool_base;

    /**
     * Hand out `len` bytes from the pool. The returned memory is never
     * freed.
     *
     * - If the current chunk has room, the bytes are carved from it.
     * - If `len` is larger than REALLOC_SIZE, the request gets its own
     *   malloc'd buffer and the current chunk is left untouched.
     * - Otherwise a fresh REALLOC_SIZE chunk replaces the current one.
     *
     * Returns NULL if malloc fails; the pool state is unchanged in
     * that case.
     */
    inline void *pool_alloc(size_t len)
    {
        if (len <= _pool_size)
        {
            void *mem = _pool_base;
            _pool_size -= len;
            _pool_base += len;
            return mem;
        }

        /* An oversized request can never fit in a fresh chunk. Serving
         * it from one would return a buffer shorter than requested and
         * make REALLOC_SIZE - len wrap around to a huge _pool_size. */
        if (len > REALLOC_SIZE)
            return std::malloc(len);

        /* NOTE: what is left of the old chunk is simply discarded,
         * which WILL leak it. This function is not meant for large
         * objects, and a larger chunk size reduces the impact, so this
         * is generally not a problem. */
        char *chunk = static_cast<char *>(std::malloc(REALLOC_SIZE));
        if (chunk == NULL)
            return NULL;

        _pool_base = chunk + len;
        _pool_size = REALLOC_SIZE - len;
        return chunk;
    }
}
|
42
|
+
|
43
|
+
#endif /* _MEMORY_H_ */
|
@@ -0,0 +1,263 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <cstdio> // for debug
|
3
|
+
|
4
|
+
#include "token.h"
|
5
|
+
#include "dict.h"
|
6
|
+
#include "algor.h"
|
7
|
+
|
8
|
+
using namespace std;
|
9
|
+
|
10
|
+
extern "C" {
|
11
|
+
|
12
|
+
/*****************************************
|
13
|
+
*
|
14
|
+
* Normal interface
|
15
|
+
*
|
16
|
+
*****************************************/
|
17
|
+
|
18
|
+
/*********************
|
19
|
+
* RMMSeg module
|
20
|
+
*********************/
|
21
|
+
/* Handle to the top-level RMMSeg module; assigned in Init_rmmseg. */
static VALUE mRMMSeg;
|
22
|
+
|
23
|
+
|
24
|
+
/*********************
|
25
|
+
* Dictionary module
|
26
|
+
*********************/
|
27
|
+
/* Handle to the RMMSeg::Dictionary module; assigned in Init_rmmseg. */
static VALUE mDictionary;
|
28
|
+
|
29
|
+
/*
|
30
|
+
* Load a character dictionary.
|
31
|
+
*
|
32
|
+
* call-seq:
|
33
|
+
* load_chars(path) -> status
|
34
|
+
*
|
35
|
+
* Return +true+ if loaded successfully, +false+ otherwise.
|
36
|
+
*/
|
37
|
+
static VALUE dic_load_chars(VALUE mod, VALUE path)
|
38
|
+
{
|
39
|
+
if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
|
40
|
+
return Qtrue;
|
41
|
+
return Qfalse;
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
|
45
|
+
* Load a word dictionary.
|
46
|
+
*
|
47
|
+
* call-seq:
|
48
|
+
* load_words(path) -> status
|
49
|
+
*
|
50
|
+
* Return +true+ if loaded successfully, +false+ otherwise.
|
51
|
+
*/
|
52
|
+
static VALUE dic_load_words(VALUE mod, VALUE path)
|
53
|
+
{
|
54
|
+
if (rmmseg::dict::load_words(RSTRING_PTR(path)))
|
55
|
+
return Qtrue;
|
56
|
+
return Qfalse;
|
57
|
+
}
|
58
|
+
|
59
|
+
/*
|
60
|
+
* Add a word to the in-memory dictionary.
|
61
|
+
*
|
62
|
+
* call-seq:
|
63
|
+
* add(word, length, freq)
|
64
|
+
*
|
65
|
+
* - +word+ is a String.
|
66
|
+
* - +length+ is number of characters (not number of bytes) of the
|
67
|
+
* word to be added.
|
68
|
+
* - +freq+ is the frequency of the word. This is only used when
|
69
|
+
* it is a one-character word.
|
70
|
+
*/
|
71
|
+
static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
|
72
|
+
{
|
73
|
+
const char *str = RSTRING_PTR(word);
|
74
|
+
int nbytes = RSTRING_LEN(word);
|
75
|
+
rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
|
76
|
+
rmmseg::dict::add(w);
|
77
|
+
return Qnil;
|
78
|
+
}
|
79
|
+
|
80
|
+
/*
|
81
|
+
* Check whether one word is included in the dictionary.
|
82
|
+
*
|
83
|
+
* call-seq:
|
84
|
+
* has_word?(word) -> result
|
85
|
+
*
|
86
|
+
* Return +true+ if the word is included in the dictionary,
|
87
|
+
* +false+ otherwise.
|
88
|
+
*/
|
89
|
+
static VALUE dic_has_word(VALUE mod, VALUE word)
|
90
|
+
{
|
91
|
+
const char *str = RSTRING_PTR(word);
|
92
|
+
int nbytes = RSTRING_LEN(word);
|
93
|
+
if (rmmseg::dict::get(str, nbytes) != NULL)
|
94
|
+
return Qtrue;
|
95
|
+
return Qfalse;
|
96
|
+
}
|
97
|
+
|
98
|
+
|
99
|
+
/**********************
|
100
|
+
* Token Class
|
101
|
+
**********************/
|
102
|
+
struct Token
|
103
|
+
{
|
104
|
+
VALUE text;
|
105
|
+
VALUE start;
|
106
|
+
VALUE end;
|
107
|
+
};
|
108
|
+
|
109
|
+
/* GC mark callback registered for Token objects. */
static void tk_mark(Token *t)
{
    /* Only text needs marking; start and end are Fixnums. */
    rb_gc_mark(t->text);
}
|
114
|
+
/*
 * GC free callback registered for Token objects.
 *
 * The struct is obtained with ALLOC in tk_create, so it is released
 * with the matching xfree rather than plain free.
 */
static void tk_free(Token *t)
{
    xfree(t);
}
|
118
|
+
|
119
|
+
/*
|
120
|
+
* Get the text held by this token.
|
121
|
+
*
|
122
|
+
* call-seq:
|
123
|
+
* text() -> text
|
124
|
+
*
|
125
|
+
*/
|
126
|
+
static VALUE tk_text(VALUE self)
|
127
|
+
{
|
128
|
+
Token *tk = (Token *)DATA_PTR(self);
|
129
|
+
return tk->text;
|
130
|
+
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* Get the start position of this token.
|
134
|
+
*
|
135
|
+
* call-seq:
|
136
|
+
* start() -> start_pos
|
137
|
+
*
|
138
|
+
*/
|
139
|
+
static VALUE tk_start(VALUE self)
|
140
|
+
{
|
141
|
+
Token *tk = (Token *)DATA_PTR(self);
|
142
|
+
return tk->start;
|
143
|
+
}
|
144
|
+
|
145
|
+
/*
|
146
|
+
* Get the end position of this token.
|
147
|
+
*
|
148
|
+
* call-seq:
|
149
|
+
* end() -> end_pos
|
150
|
+
*
|
151
|
+
*/
|
152
|
+
static VALUE tk_end(VALUE self)
|
153
|
+
{
|
154
|
+
Token *tk = (Token *)DATA_PTR(self);
|
155
|
+
return tk->end;
|
156
|
+
}
|
157
|
+
|
158
|
+
/* Handle to the RMMSeg::Token class; assigned in Init_rmmseg. */
static VALUE cToken;
|
159
|
+
/*
 * Wrap a segmenter token in a new RMMSeg::Token object.
 *
 * - base is the start of the text buffer the token points into; it is
 *   used to turn the token's pointer into an offset.
 * - t is the token to wrap; its bytes are copied into a new String.
 */
static VALUE tk_create(const char* base, const rmmseg::Token &t)
{
    /* Keep the pointer difference as a long instead of narrowing it
     * to int. */
    const long start = static_cast<long>(t.text - base);

    /* Build the String before allocating the struct, so that nothing
     * is leaked if rb_str_new raises.
     *
     * The volatile qualifier is kept from the original code, see
     * http://lifegoo.pluskid.org/?p=348 */
    volatile VALUE text = rb_str_new(t.text, t.length);

    Token *tk = ALLOC(Token);
    tk->text = text;
    tk->start = LONG2FIX(start);
    tk->end = LONG2FIX(start + t.length);

    volatile VALUE tok = Data_Wrap_Struct(cToken,
                                          (RUBY_DATA_FUNC)tk_mark,
                                          (RUBY_DATA_FUNC)tk_free,
                                          tk);
    return tok;
}
|
177
|
+
|
178
|
+
/*********************
|
179
|
+
* Algorithm Class
|
180
|
+
*********************/
|
181
|
+
struct Algorithm
|
182
|
+
{
|
183
|
+
VALUE text; // hold to avoid being garbage collected
|
184
|
+
rmmseg::Algorithm *algor;
|
185
|
+
};
|
186
|
+
|
187
|
+
/* GC mark callback registered for Algorithm objects: keeps the
 * segmented String alive. */
static void algor_mark(Algorithm *a)
{
    rb_gc_mark(a->text);
}
|
191
|
+
/*
 * GC free callback registered for Algorithm objects.
 *
 * algor_create placement-constructs the rmmseg::Algorithm inside a
 * malloc'd buffer and obtains the wrapper struct with ALLOC, so the
 * teardown mirrors that: run the destructor, free the buffer, then
 * xfree the wrapper (which was previously never released).
 */
static void algor_free(Algorithm *a)
{
    /* The unqualified name Algorithm refers to the wrapper struct
     * here, so give the native class an unambiguous local name. */
    typedef rmmseg::Algorithm Segmenter;

    if (a->algor != NULL)
    {
        a->algor->~Segmenter();
        free(a->algor);
    }
    xfree(a);
}
|
195
|
+
|
196
|
+
/* Handle to the RMMSeg::Algorithm class; assigned in Init_rmmseg. */
static VALUE cAlgorithm;
|
197
|
+
|
198
|
+
/*
|
199
|
+
* Create an Algorithm object to do segmenting on +text+.
|
200
|
+
*
|
201
|
+
* call-seq:
|
202
|
+
* new(text) -> algorithm
|
203
|
+
*
|
204
|
+
*/
|
205
|
+
static VALUE algor_create(VALUE klass, VALUE text)
|
206
|
+
{
|
207
|
+
Algorithm *algor = ALLOC(Algorithm);
|
208
|
+
void *mem;
|
209
|
+
algor->text = text;
|
210
|
+
mem = malloc(sizeof(rmmseg::Algorithm));
|
211
|
+
algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
|
212
|
+
RSTRING_LEN(text));
|
213
|
+
|
214
|
+
return Data_Wrap_Struct(klass,
|
215
|
+
(RUBY_DATA_FUNC)algor_mark,
|
216
|
+
(RUBY_DATA_FUNC)algor_free,
|
217
|
+
algor);
|
218
|
+
}
|
219
|
+
|
220
|
+
/*
|
221
|
+
* Get next token.
|
222
|
+
*
|
223
|
+
* call-seq:
|
224
|
+
* next_token() -> token
|
225
|
+
*
|
226
|
+
* Return +nil+ if no more token available.
|
227
|
+
*/
|
228
|
+
static VALUE algor_next_token(VALUE self)
|
229
|
+
{
|
230
|
+
Algorithm *algor = (Algorithm *)DATA_PTR(self);
|
231
|
+
rmmseg::Token tk = algor->algor->next_token();
|
232
|
+
|
233
|
+
if (tk.length == 0)
|
234
|
+
return Qnil;
|
235
|
+
volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
|
236
|
+
return rtk;
|
237
|
+
}
|
238
|
+
|
239
|
+
|
240
|
+
void Init_rmmseg()
|
241
|
+
{
|
242
|
+
mRMMSeg = rb_define_module("RMMSeg");
|
243
|
+
|
244
|
+
/* Manage dictionaries used by rmmseg. */
|
245
|
+
mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
|
246
|
+
rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
|
247
|
+
rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
|
248
|
+
rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
|
249
|
+
rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
|
250
|
+
|
251
|
+
/* A Token hold the text and related position information. */
|
252
|
+
cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
|
253
|
+
rb_undef_method(rb_singleton_class(cToken), "new");
|
254
|
+
rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
|
255
|
+
rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
|
256
|
+
rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
|
257
|
+
|
258
|
+
/* An Algorithm object use the MMSEG algorithm to do segmenting. */
|
259
|
+
cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
|
260
|
+
rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
|
261
|
+
rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
|
262
|
+
}
|
263
|
+
}
|
data/ext/rmmseg/rules.h
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#ifndef _RULES_H_
|
2
|
+
#define _RULES_H_
|
3
|
+
|
4
|
+
#include <vector>
|
5
|
+
#include <algorithm>
|
6
|
+
|
7
|
+
#include "chunk.h"
|
8
|
+
|
9
|
+
namespace rmmseg
|
10
|
+
{
|
11
|
+
template <typename Cmp>
|
12
|
+
void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
|
13
|
+
{
|
14
|
+
unsigned int i = 1, j;
|
15
|
+
|
16
|
+
for (j = 1; j < chunks.size(); ++j)
|
17
|
+
{
|
18
|
+
int rlt = cmp(chunks[j], chunks[0]);
|
19
|
+
if (rlt > 0)
|
20
|
+
i = 0;
|
21
|
+
if (rlt >= 0)
|
22
|
+
std::swap(chunks[i++], chunks[j]);
|
23
|
+
}
|
24
|
+
chunks.erase(chunks.begin()+i, chunks.end());
|
25
|
+
}
|
26
|
+
|
27
|
+
struct MMCmp_t
|
28
|
+
{
|
29
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
30
|
+
{
|
31
|
+
return a.total_length() - b.total_length();
|
32
|
+
}
|
33
|
+
} MMCmp;
|
34
|
+
void mm_filter(std::vector<Chunk> &chunks)
|
35
|
+
{
|
36
|
+
take_highest(chunks, MMCmp);
|
37
|
+
}
|
38
|
+
|
39
|
+
struct LAWLCmp_t
|
40
|
+
{
|
41
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
42
|
+
{
|
43
|
+
double rlt = a.average_length() - b.average_length();
|
44
|
+
if (rlt == 0)
|
45
|
+
return 0;
|
46
|
+
if (rlt > 0)
|
47
|
+
return 1;
|
48
|
+
return -1;
|
49
|
+
}
|
50
|
+
} LAWLCmp;
|
51
|
+
void lawl_filter(std::vector<Chunk> &chunks)
|
52
|
+
{
|
53
|
+
take_highest(chunks, LAWLCmp);
|
54
|
+
}
|
55
|
+
|
56
|
+
struct SVWLCmp_t
|
57
|
+
{
|
58
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
59
|
+
{
|
60
|
+
double rlt = a.variance() - b.variance();
|
61
|
+
if (rlt == 0)
|
62
|
+
return 0;
|
63
|
+
if (rlt < 0)
|
64
|
+
return 1;
|
65
|
+
return -1;
|
66
|
+
}
|
67
|
+
} SVWLCmp;
|
68
|
+
void svwl_filter(std::vector<Chunk> &chunks)
|
69
|
+
{
|
70
|
+
take_highest(chunks, SVWLCmp);
|
71
|
+
}
|
72
|
+
|
73
|
+
struct LSDMFOCWCmp_t
|
74
|
+
{
|
75
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
76
|
+
{
|
77
|
+
return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
|
78
|
+
}
|
79
|
+
} LSDMFOCWCmp;
|
80
|
+
void lsdmfocw_filter(std::vector<Chunk> &chunks)
|
81
|
+
{
|
82
|
+
take_highest(chunks, LSDMFOCWCmp);
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
#endif /* _RULES_H_ */
|