rmmseg-cpp-traditional 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/History.txt +21 -0
- data/LICENSE.txt +22 -0
- data/Manifest.txt +43 -0
- data/README +111 -0
- data/README.md +29 -0
- data/Rakefile +19 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +222 -0
- data/ext/rmmseg/algor.h +80 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +17 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- data/lib/rmmseg-cpp-traditional/version.rb +7 -0
- data/lib/rmmseg-cpp-traditional.rb +9 -0
- data/lib/rmmseg.rb +3 -0
- data/misc/convert.rb +114 -0
- data/misc/ferret_example.rb +59 -0
- data/misc/homepage.erb +196 -0
- data/misc/homepage.html +1212 -0
- data/rmmseg-cpp-traditional.gemspec +19 -0
- data/spec/rmmseg_spec.rb +8 -0
- data/spec/spec_helper.rb +17 -0
- data/tasks/ann.rake +81 -0
- data/tasks/bones.rake +21 -0
- data/tasks/gem.rake +126 -0
- data/tasks/git.rake +41 -0
- data/tasks/homepage.rake +15 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/notes.rake +28 -0
- data/tasks/post_load.rake +39 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +58 -0
- data/tasks/setup.rb +268 -0
- data/tasks/spec.rake +55 -0
- data/tasks/svn.rake +48 -0
- data/tasks/test.rake +38 -0
- data/test/test_rmmseg.rb +0 -0
- metadata +116 -0
@@ -0,0 +1,263 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <cstdio> // for debug
|
3
|
+
|
4
|
+
#include "token.h"
|
5
|
+
#include "dict.h"
|
6
|
+
#include "algor.h"
|
7
|
+
|
8
|
+
using namespace std;
|
9
|
+
|
10
|
+
extern "C" {
|
11
|
+
|
12
|
+
/*****************************************
|
13
|
+
*
|
14
|
+
* Normal interface
|
15
|
+
*
|
16
|
+
*****************************************/
|
17
|
+
|
18
|
+
/*********************
|
19
|
+
* RMMSeg module
|
20
|
+
*********************/
|
21
|
+
static VALUE mRMMSeg;
|
22
|
+
|
23
|
+
|
24
|
+
/*********************
|
25
|
+
* Dictionary module
|
26
|
+
*********************/
|
27
|
+
static VALUE mDictionary;
|
28
|
+
|
29
|
+
/*
|
30
|
+
* Load a character dictionary.
|
31
|
+
*
|
32
|
+
* call-seq:
|
33
|
+
* load_chars(path) -> status
|
34
|
+
*
|
35
|
+
* Return +true+ if loaded successfully, +false+ otherwise.
|
36
|
+
*/
|
37
|
+
static VALUE dic_load_chars(VALUE mod, VALUE path)
|
38
|
+
{
|
39
|
+
if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
|
40
|
+
return Qtrue;
|
41
|
+
return Qfalse;
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
|
45
|
+
* Load a word dictionary.
|
46
|
+
*
|
47
|
+
* call-seq:
|
48
|
+
* load_words(path) -> status
|
49
|
+
*
|
50
|
+
* Return +true+ if loaded successfully, +false+ otherwise.
|
51
|
+
*/
|
52
|
+
static VALUE dic_load_words(VALUE mod, VALUE path)
|
53
|
+
{
|
54
|
+
if (rmmseg::dict::load_words(RSTRING_PTR(path)))
|
55
|
+
return Qtrue;
|
56
|
+
return Qfalse;
|
57
|
+
}
|
58
|
+
|
59
|
+
/*
|
60
|
+
* Add a word to the in-memory dictionary.
|
61
|
+
*
|
62
|
+
* call-seq:
|
63
|
+
* add(word, length, freq)
|
64
|
+
*
|
65
|
+
* - +word+ is a String.
|
66
|
+
* - +length+ is number of characters (not number of bytes) of the
|
67
|
+
* word to be added.
|
68
|
+
* - +freq+ is the frequency of the word. This is only used when
|
69
|
+
* it is a one-character word.
|
70
|
+
*/
|
71
|
+
static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
|
72
|
+
{
|
73
|
+
const char *str = RSTRING_PTR(word);
|
74
|
+
int nbytes = RSTRING_LEN(word);
|
75
|
+
rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
|
76
|
+
rmmseg::dict::add(w);
|
77
|
+
return Qnil;
|
78
|
+
}
|
79
|
+
|
80
|
+
/*
|
81
|
+
* Check whether one word is included in the dictionary.
|
82
|
+
*
|
83
|
+
* call-seq:
|
84
|
+
* has_word?(word) -> result
|
85
|
+
*
|
86
|
+
* Return +true+ if the word is included in the dictionary,
|
87
|
+
* +false+ otherwise.
|
88
|
+
*/
|
89
|
+
static VALUE dic_has_word(VALUE mod, VALUE word)
|
90
|
+
{
|
91
|
+
const char *str = RSTRING_PTR(word);
|
92
|
+
int nbytes = RSTRING_LEN(word);
|
93
|
+
if (rmmseg::dict::get(str, nbytes) != NULL)
|
94
|
+
return Qtrue;
|
95
|
+
return Qfalse;
|
96
|
+
}
|
97
|
+
|
98
|
+
|
99
|
+
/**********************
|
100
|
+
* Token Class
|
101
|
+
**********************/
|
102
|
+
struct Token
|
103
|
+
{
|
104
|
+
VALUE text;
|
105
|
+
VALUE start;
|
106
|
+
VALUE end;
|
107
|
+
};
|
108
|
+
|
109
|
+
/* GC mark hook for Token: keeps the wrapped text String alive. */
static void tk_mark(Token *t)
{
    // `start' and `end' are stored as Fixnums (see tk_create), which are
    // immediates, so only the text needs to be reported to the GC.
    const VALUE held_text = t->text;
    rb_gc_mark(held_text);
}
|
114
|
+
/* GC free hook for Token: releases the struct allocated in tk_create. */
static void tk_free(Token *t)
{
    // The struct comes from ALLOC (Ruby's xmalloc), so it has to go back
    // through xfree rather than the C library's free().
    xfree(t);
}
|
118
|
+
|
119
|
+
/*
|
120
|
+
* Get the text held by this token.
|
121
|
+
*
|
122
|
+
* call-seq:
|
123
|
+
* text() -> text
|
124
|
+
*
|
125
|
+
*/
|
126
|
+
static VALUE tk_text(VALUE self)
|
127
|
+
{
|
128
|
+
Token *tk = (Token *)DATA_PTR(self);
|
129
|
+
return tk->text;
|
130
|
+
}
|
131
|
+
|
132
|
+
/*
|
133
|
+
* Get the start position of this token.
|
134
|
+
*
|
135
|
+
* call-seq:
|
136
|
+
* start() -> start_pos
|
137
|
+
*
|
138
|
+
*/
|
139
|
+
static VALUE tk_start(VALUE self)
|
140
|
+
{
|
141
|
+
Token *tk = (Token *)DATA_PTR(self);
|
142
|
+
return tk->start;
|
143
|
+
}
|
144
|
+
|
145
|
+
/*
|
146
|
+
* Get the end position of this token.
|
147
|
+
*
|
148
|
+
* call-seq:
|
149
|
+
* end() -> end_pos
|
150
|
+
*
|
151
|
+
*/
|
152
|
+
static VALUE tk_end(VALUE self)
|
153
|
+
{
|
154
|
+
Token *tk = (Token *)DATA_PTR(self);
|
155
|
+
return tk->end;
|
156
|
+
}
|
157
|
+
|
158
|
+
static VALUE cToken;
|
159
|
+
/*
 * Wrap a segmenter token in a Ruby Token object.
 *
 * base: start of the buffer the token points into; the token's start
 *       and end are stored as byte offsets relative to it.
 * t:    the token (pointer + byte length) to wrap.
 */
static VALUE tk_create(const char* base, const rmmseg::Token &t)
{
    const int start = t.text - base;

    // Create the Ruby String before allocating the C struct: rb_str_new
    // can raise, and doing it first means there is no struct to leak.
    //
    // The volatile local is necessary, see
    // http://lifegoo.pluskid.org/?p=348
    volatile VALUE text = rb_str_new(t.text, t.length);

    Token *tk = ALLOC(Token);
    tk->text = text;
    tk->start = INT2FIX(start);
    tk->end = INT2FIX(start + t.length);

    volatile VALUE tok = Data_Wrap_Struct(cToken,
                                          (RUBY_DATA_FUNC)tk_mark,
                                          (RUBY_DATA_FUNC)tk_free,
                                          tk);
    return tok;
}
|
177
|
+
|
178
|
+
/*********************
|
179
|
+
* Algorithm Class
|
180
|
+
*********************/
|
181
|
+
struct Algorithm
|
182
|
+
{
|
183
|
+
VALUE text; // hold to avoid being garbage collected
|
184
|
+
rmmseg::Algorithm *algor;
|
185
|
+
};
|
186
|
+
|
187
|
+
/* GC mark hook for Algorithm: keeps the String being segmented alive. */
static void algor_mark(Algorithm *a)
{
    const VALUE held_text = a->text;
    rb_gc_mark(held_text);
}
|
191
|
+
/*
 * GC free hook for Algorithm: destroys the segmenter and releases both
 * allocations made by algor_create.
 */
static void algor_free(Algorithm *a)
{
    if (a->algor != NULL)
    {
        // The segmenter was built with placement new inside a malloc'd
        // buffer, so it is destroyed explicitly before the buffer is freed.
        typedef rmmseg::Algorithm Segmenter;
        a->algor->~Segmenter();
        free(a->algor);
    }

    // The wrapper struct itself comes from ALLOC and was never released.
    xfree(a);
}
|
195
|
+
|
196
|
+
static VALUE cAlgorithm;
|
197
|
+
|
198
|
+
/*
|
199
|
+
* Create an Algorithm object to do segmenting on +text+.
|
200
|
+
*
|
201
|
+
* call-seq:
|
202
|
+
* new(text) -> algorithm
|
203
|
+
*
|
204
|
+
*/
|
205
|
+
static VALUE algor_create(VALUE klass, VALUE text)
|
206
|
+
{
|
207
|
+
Algorithm *algor = ALLOC(Algorithm);
|
208
|
+
void *mem;
|
209
|
+
algor->text = text;
|
210
|
+
mem = malloc(sizeof(rmmseg::Algorithm));
|
211
|
+
algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
|
212
|
+
RSTRING_LEN(text));
|
213
|
+
|
214
|
+
return Data_Wrap_Struct(klass,
|
215
|
+
(RUBY_DATA_FUNC)algor_mark,
|
216
|
+
(RUBY_DATA_FUNC)algor_free,
|
217
|
+
algor);
|
218
|
+
}
|
219
|
+
|
220
|
+
/*
|
221
|
+
* Get next token.
|
222
|
+
*
|
223
|
+
* call-seq:
|
224
|
+
* next_token() -> token
|
225
|
+
*
|
226
|
+
* Return +nil+ if no more token available.
|
227
|
+
*/
|
228
|
+
static VALUE algor_next_token(VALUE self)
|
229
|
+
{
|
230
|
+
Algorithm *algor = (Algorithm *)DATA_PTR(self);
|
231
|
+
rmmseg::Token tk = algor->algor->next_token();
|
232
|
+
|
233
|
+
if (tk.length == 0)
|
234
|
+
return Qnil;
|
235
|
+
volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
|
236
|
+
return rtk;
|
237
|
+
}
|
238
|
+
|
239
|
+
|
240
|
+
void Init_rmmseg()
|
241
|
+
{
|
242
|
+
mRMMSeg = rb_define_module("RMMSeg");
|
243
|
+
|
244
|
+
/* Manage dictionaries used by rmmseg. */
|
245
|
+
mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
|
246
|
+
rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
|
247
|
+
rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
|
248
|
+
rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
|
249
|
+
rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
|
250
|
+
|
251
|
+
/* A Token hold the text and related position information. */
|
252
|
+
cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
|
253
|
+
rb_undef_method(rb_singleton_class(cToken), "new");
|
254
|
+
rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
|
255
|
+
rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
|
256
|
+
rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
|
257
|
+
|
258
|
+
/* An Algorithm object use the MMSEG algorithm to do segmenting. */
|
259
|
+
cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
|
260
|
+
rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
|
261
|
+
rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
|
262
|
+
}
|
263
|
+
}
|
data/ext/rmmseg/rules.h
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#ifndef _RULES_H_
|
2
|
+
#define _RULES_H_
|
3
|
+
|
4
|
+
#include <vector>
|
5
|
+
#include <algorithm>
|
6
|
+
|
7
|
+
#include "chunk.h"
|
8
|
+
|
9
|
+
namespace rmmseg
|
10
|
+
{
|
11
|
+
template <typename Cmp>
|
12
|
+
void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
|
13
|
+
{
|
14
|
+
unsigned int i = 1, j;
|
15
|
+
|
16
|
+
for (j = 1; j < chunks.size(); ++j)
|
17
|
+
{
|
18
|
+
int rlt = cmp(chunks[j], chunks[0]);
|
19
|
+
if (rlt > 0)
|
20
|
+
i = 0;
|
21
|
+
if (rlt >= 0)
|
22
|
+
std::swap(chunks[i++], chunks[j]);
|
23
|
+
}
|
24
|
+
chunks.erase(chunks.begin()+i, chunks.end());
|
25
|
+
}
|
26
|
+
|
27
|
+
struct MMCmp_t
|
28
|
+
{
|
29
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
30
|
+
{
|
31
|
+
return a.total_length() - b.total_length();
|
32
|
+
}
|
33
|
+
} MMCmp;
|
34
|
+
void mm_filter(std::vector<Chunk> &chunks)
|
35
|
+
{
|
36
|
+
take_highest(chunks, MMCmp);
|
37
|
+
}
|
38
|
+
|
39
|
+
struct LAWLCmp_t
|
40
|
+
{
|
41
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
42
|
+
{
|
43
|
+
double rlt = a.average_length() - b.average_length();
|
44
|
+
if (rlt == 0)
|
45
|
+
return 0;
|
46
|
+
if (rlt > 0)
|
47
|
+
return 1;
|
48
|
+
return -1;
|
49
|
+
}
|
50
|
+
} LAWLCmp;
|
51
|
+
void lawl_filter(std::vector<Chunk> &chunks)
|
52
|
+
{
|
53
|
+
take_highest(chunks, LAWLCmp);
|
54
|
+
}
|
55
|
+
|
56
|
+
struct SVWLCmp_t
|
57
|
+
{
|
58
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
59
|
+
{
|
60
|
+
double rlt = a.variance() - b.variance();
|
61
|
+
if (rlt == 0)
|
62
|
+
return 0;
|
63
|
+
if (rlt < 0)
|
64
|
+
return 1;
|
65
|
+
return -1;
|
66
|
+
}
|
67
|
+
} SVWLCmp;
|
68
|
+
void svwl_filter(std::vector<Chunk> &chunks)
|
69
|
+
{
|
70
|
+
take_highest(chunks, SVWLCmp);
|
71
|
+
}
|
72
|
+
|
73
|
+
struct LSDMFOCWCmp_t
|
74
|
+
{
|
75
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
76
|
+
{
|
77
|
+
return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
|
78
|
+
}
|
79
|
+
} LSDMFOCWCmp;
|
80
|
+
void lsdmfocw_filter(std::vector<Chunk> &chunks)
|
81
|
+
{
|
82
|
+
take_highest(chunks, LSDMFOCWCmp);
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
#endif /* _RULES_H_ */
|
data/ext/rmmseg/token.h
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef _TOKEN_H_
|
2
|
+
#define _TOKEN_H_
|
3
|
+
|
4
|
+
namespace rmmseg
{
    /**
     * A slice of text: a pointer plus a byte count. The token does not
     * copy the text it points at.
     */
    struct Token
    {
        // Start of the token's bytes. This may or may not be
        // nul-terminated, so always consult `length'.
        const char *text;

        // Number of bytes in `text'. A length of 0 denotes an empty token.
        int         length;

        Token(const char *txt, int len)
        {
            this->text = txt;
            this->length = len;
        }
    };
}
|
18
|
+
|
19
|
+
#endif /* _TOKEN_H_ */
|
data/ext/rmmseg/word.h
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#ifndef _WORD_H_
|
2
|
+
#define _WORD_H_
|
3
|
+
|
4
|
+
#include <climits>
|
5
|
+
#include <cstring>
|
6
|
+
|
7
|
+
#include "memory.h"
|
8
|
+
|
9
|
+
namespace rmmseg
|
10
|
+
{
|
11
|
+
const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
|
12
|
+
struct Word
|
13
|
+
{
|
14
|
+
unsigned char nbytes; /* number of bytes */
|
15
|
+
char length; /* number of characters */
|
16
|
+
unsigned short freq;
|
17
|
+
char text[word_embed_len];
|
18
|
+
};
|
19
|
+
|
20
|
+
/**
|
21
|
+
* text: the text of the word.
|
22
|
+
* length: number of characters (not bytes).
|
23
|
+
* freq: the frequency of the word.
|
24
|
+
*/
|
25
|
+
inline Word *make_word(const char *text, int length=1,
|
26
|
+
int freq=0, int nbytes=-1)
|
27
|
+
{
|
28
|
+
if (freq > USHRT_MAX)
|
29
|
+
freq = USHRT_MAX; /* avoid overflow */
|
30
|
+
if (nbytes == -1)
|
31
|
+
nbytes = std::strlen(text);
|
32
|
+
Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
|
33
|
+
+ nbytes+1
|
34
|
+
- word_embed_len));
|
35
|
+
w->nbytes = nbytes;
|
36
|
+
w->length = length;
|
37
|
+
w->freq = freq;
|
38
|
+
std::strncpy(w->text, text, nbytes);
|
39
|
+
w->text[nbytes] = '\0';
|
40
|
+
return w;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
#endif /* _WORD_H_ */
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module RMMSeg
  module Dictionary
    # Default dictionaries, resolved relative to this file.
    @dictionaries = [[:chars, "chars.dic"], [:words, "words.dic"]].map do |type, name|
      [type, File.join(File.dirname(__FILE__), "..", "..", "data", name)]
    end

    class << self
      #
      # An array of dictionaries used by RMMSeg. Each entry is of the
      # following form:
      #
      #   [type, path]
      #
      # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and
      # +path+ is the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the
      # following form:
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. They are separated by <b>exactly one space</b>.
      #
      # The format of a <tt>:words</tt> dictionary is similar:
      #
      #   length word
      #
      # except that the first number is not the frequency but the number
      # of characters (not number of bytes) in the word.
      #
      # The convert.rb script shipped with the package can be used to
      # convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary. +type+ can be
      # <tt>:chars</tt> or <tt>:words</tt>; see the doc of dictionaries.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this after the dictionary
      # paths have been set up and before any Algorithm object is
      # created. Entries whose type is neither <tt>:chars</tt> nor
      # <tt>:words</tt> are skipped.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rmmseg'
|
3
|
+
require 'ferret'
|
4
|
+
|
5
|
+
module RMMSeg
  module Ferret
    # An Analyzer that plugs RMMSeg segmenting into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Construct an Analyzer. An optional block receives the tokenizer
      # and may wrap it in further +TokenFilter+s, e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Build the token stream for +text+: the block's result when a
      # block was given at construction, the bare Tokenizer otherwise.
      def token_stream(field, text)
        stream = Tokenizer.new(text)
        @brk ? @brk.call(stream) : stream
      end
    end

    # A token stream that tokenizes text with RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Get the text being tokenized
      attr_reader :text

      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Get the next token, or +nil+ when the text is exhausted. The same
      # Ferret token object is refilled and returned on every call.
      def next
        tok = @algor.next_token
        return nil if tok.nil?

        @token.text = tok.text
        @token.start = tok.start
        @token.end = tok.end
        @token
      end

      # Set the text to be tokenized; this restarts tokenizing.
      def text=(str)
        @token = ::Ferret::Analysis::Token.new("", 0, 0)
        @text = str
        @algor = Algorithm.new(@text)
      end
    end
  end
end
|
data/lib/rmmseg.rb
ADDED
data/misc/convert.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# A utility used to convert the old RMMSeg dictionary
|
4
|
+
# to rmmseg-cpp format.
|
5
|
+
|
6
|
+
# There are several constrains for the new rmmseg-cpp
|
7
|
+
# dictionary format.
|
8
|
+
# - length of word should be specified in the dict
|
9
|
+
# - number and string should be separated by ONE space
|
10
|
+
# - there should be a newline at the end of file
|
11
|
+
|
12
|
+
$KCODE='u'
|
13
|
+
require 'jcode'
|
14
|
+
|
15
|
+
# Print the usage text, preceded by +msg+ as an error line when given,
# then terminate the script. The exit status is 1 when an error message
# was supplied and 0 otherwise.
def usage(msg=nil)
  puts "***ERROR: #{msg}\n\n" if msg
  puts <<EOT
Usage:

#{$0} action type input.dic output.dic

action: either 'convert' or 'normalize'
- 'convert' is used to convert the dict from
old RMMSeg format.
- 'normalize' is used to normalize an existing
rmmseg-cpp dict.

type: either 'words' or 'chars'

EOT
  # Report failure to the calling shell when we got here via an error.
  exit(msg ? 1 : 0)
end
|
33
|
+
|
34
|
+
# Validate the command line: exactly four arguments, a known action and
# a known dictionary type.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
|
37
|
+
|
38
|
+
# Write +data+ (pairs of [number, word]) to the output file named by
# ARGV[3], one "number word" line per pair. Pairs without a word are
# skipped.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each do |num, word|
      next unless word
      out.puts "#{num} #{word}"
    end
  end
end
|
45
|
+
|
46
|
+
# Read an old-format RMMSeg chars dictionary ("char freq" per line) from
# ARGV[2] and return [freq, char] pairs, with the frequencies rescaled so
# that the largest one becomes 65535. Lines that do not match yield
# [nil, nil].
def read_RMMSeg_chars
  max = 0
  entries = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(.)\s+(\d+)$/
      n = $2.to_i
      max = n if n > max
      [n, $1]
    else
      [nil, nil]
    end
  end

  entries.map do |num, word|
    if word
      # max is 0 when every frequency is 0; scaling would divide by zero.
      [max.zero? ? 0 : num*65535/max, word]
    else
      [nil, nil]
    end
  end
end
|
64
|
+
|
65
|
+
# Read an old-format RMMSeg words dictionary (one word per line) from
# ARGV[2] and return [character count, word] pairs. Empty lines yield
# [nil, nil].
def read_RMMSeg_words
  File.readlines(ARGV[2]).map do |line|
    word = line.chomp
    word.empty? ? [nil, nil] : [word.jlength, word]
  end
end
|
75
|
+
|
76
|
+
# Read an rmmseg-cpp chars dictionary ("freq char" per line) from ARGV[2]
# and return [freq, char] pairs, with the frequencies rescaled so that
# the largest one becomes 65535. Lines that do not match yield
# [nil, nil].
def read_rmmseg_cpp_chars
  max = 0
  entries = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(\d+)\s+(.)$/
      n = $1.to_i
      max = n if n > max
      [n, $2]
    else
      [nil, nil]
    end
  end

  entries.map do |num, word|
    if word
      # max is 0 when every frequency is 0; scaling would divide by zero.
      [max.zero? ? 0 : num*65535/max, word]
    else
      [nil, nil]
    end
  end
end
|
94
|
+
|
95
|
+
# Read an rmmseg-cpp words dictionary ("length word" per line) from
# ARGV[2] and return [length, word] pairs, both as strings. Lines that
# do not match yield [nil, nil].
def read_rmmseg_cpp_words
  File.readlines(ARGV[2]).map do |line|
    parsed = /^(\d+)\s+(\w+)$/.match(line)
    parsed ? [parsed[1], parsed[2]] : [nil, nil]
  end
end
|
104
|
+
|
105
|
+
# Pick the reader matching the (action, type) pair given on the command
# line and write its result to the output file.
readers = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}
reader = readers[ARGV[0,2]]
output(send(reader)) if reader
|