rmmseg-cpp 0.2.7 → 0.2.9
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/README +11 -0
- data/Rakefile +1 -1
- data/ext/rmmseg/algor.cpp +24 -33
- data/ext/rmmseg/algor.h +13 -14
- data/ext/rmmseg/chunk.h +12 -11
- data/ext/rmmseg/dict.cpp +11 -9
- data/ext/rmmseg/memory.cpp +2 -2
- data/ext/rmmseg/memory.h +4 -4
- data/ext/rmmseg/rmmseg.cpp +13 -11
- data/ext/rmmseg/rules.h +18 -19
- data/ext/rmmseg/word.h +2 -2
- data/misc/homepage.erb +21 -4
- data/misc/homepage.html +97 -87
- metadata +32 -7
data/History.txt
CHANGED
data/README
CHANGED
@@ -43,6 +43,17 @@ init by loading the dictionaries:
|
|
43
43
|
|
44
44
|
RMMSeg::Dictionary.load_dictionaries
|
45
45
|
|
46
|
+
If you want to add customized dictionaries, append them to
|
47
|
+
+RMMSeg::Dictionary.dictionaries+ before calling +load_dictionaries+.
|
48
|
+
The formats of chars.dic and words.dic are NOT the same:
|
49
|
+
|
50
|
+
* For chars.dic, each line contains freq, a space, and then the character
|
51
|
+
* For words.dic, each line contains length, a space, and then the word.
|
52
|
+
|
53
|
+
Note that length means the length of the word, i.e. the number of characters
|
54
|
+
of the word, not the number of bytes. WARNING: there should be a newline at
|
55
|
+
the end of every dictionary file.
|
56
|
+
|
46
57
|
Then create an +Algorithm+ object and call +next_token+ until you get a
|
47
58
|
+nil+:
|
48
59
|
|
data/Rakefile
CHANGED
data/ext/rmmseg/algor.cpp
CHANGED
@@ -71,27 +71,28 @@ namespace rmmseg
|
|
71
71
|
|
72
72
|
Token Algorithm::get_cjk_word(int len)
|
73
73
|
{
|
74
|
-
create_chunks();
|
75
|
-
|
76
|
-
if (
|
77
|
-
|
78
|
-
if (
|
79
|
-
|
80
|
-
if (
|
81
|
-
|
82
|
-
if (
|
83
|
-
|
84
|
-
|
85
|
-
if (
|
74
|
+
vector<Chunk> chunks = create_chunks();
|
75
|
+
|
76
|
+
if (chunks.size() > 1)
|
77
|
+
mm_filter(chunks);
|
78
|
+
if (chunks.size() > 1)
|
79
|
+
lawl_filter(chunks);
|
80
|
+
if (chunks.size() > 1)
|
81
|
+
svwl_filter(chunks);
|
82
|
+
if (chunks.size() > 1)
|
83
|
+
lsdmfocw_filter(chunks);
|
84
|
+
|
85
|
+
if (chunks.size() < 1)
|
86
86
|
return Token(NULL, 0);
|
87
87
|
|
88
|
-
Token token(m_text+m_pos,
|
89
|
-
m_pos +=
|
88
|
+
Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
|
89
|
+
m_pos += chunks[0].words[0]->nbytes;
|
90
90
|
return token;
|
91
91
|
}
|
92
92
|
|
93
|
-
|
93
|
+
vector<Chunk> Algorithm::create_chunks()
|
94
94
|
{
|
95
|
+
vector<Chunk> chunks;
|
95
96
|
Chunk chunk;
|
96
97
|
Word *w1, *w2, *w3;
|
97
98
|
|
@@ -100,8 +101,6 @@ namespace rmmseg
|
|
100
101
|
typedef vec_t::iterator it_t;
|
101
102
|
|
102
103
|
vec_t words1 = find_match_words();
|
103
|
-
m_chunks_size = 0;
|
104
|
-
|
105
104
|
for (it_t i1 = words1.begin();
|
106
105
|
i1 != words1.end();
|
107
106
|
++i1)
|
@@ -136,17 +135,13 @@ namespace rmmseg
|
|
136
135
|
chunk.n = 3;
|
137
136
|
chunk.words[2] = w3;
|
138
137
|
}
|
139
|
-
|
140
|
-
sizeof(Chunk));
|
141
|
-
m_chunks_size++;
|
138
|
+
chunks.push_back(chunk);
|
142
139
|
}
|
143
140
|
}
|
144
141
|
else if (m_pos == m_text_length)
|
145
142
|
{
|
146
143
|
chunk.n = 2;
|
147
|
-
|
148
|
-
sizeof(Chunk));
|
149
|
-
m_chunks_size++;
|
144
|
+
chunks.push_back(chunk);
|
150
145
|
}
|
151
146
|
m_pos -= w2->nbytes;
|
152
147
|
}
|
@@ -154,13 +149,13 @@ namespace rmmseg
|
|
154
149
|
else if (m_pos == m_text_length)
|
155
150
|
{
|
156
151
|
chunk.n = 1;
|
157
|
-
|
158
|
-
m_chunks_size++;
|
152
|
+
chunks.push_back(chunk);
|
159
153
|
}
|
160
154
|
m_pos -= w1->nbytes;
|
161
155
|
}
|
162
156
|
|
163
157
|
m_pos = orig_pos;
|
158
|
+
return chunks;
|
164
159
|
}
|
165
160
|
|
166
161
|
int Algorithm::next_char()
|
@@ -169,15 +164,11 @@ namespace rmmseg
|
|
169
164
|
unsigned char ch = m_text[m_pos];
|
170
165
|
if (ch >= 0xC0 && ch <= 0xDF)
|
171
166
|
{
|
172
|
-
|
173
|
-
return 1; /* broken text at the end */
|
174
|
-
return 2;
|
167
|
+
return min(2, m_text_length-m_pos);
|
175
168
|
}
|
176
169
|
if (ch >= 0xE0 && ch <= 0xEF)
|
177
170
|
{
|
178
|
-
|
179
|
-
return 1; /* broken text at the end */
|
180
|
-
return 3;
|
171
|
+
return min(3, m_text_length-m_pos);
|
181
172
|
}
|
182
173
|
return 1;
|
183
174
|
}
|
@@ -195,11 +186,11 @@ namespace rmmseg
|
|
195
186
|
|
196
187
|
while (m_pos < m_text_length)
|
197
188
|
{
|
189
|
+
if (n >= max_word_length())
|
190
|
+
break;
|
198
191
|
len = next_char();
|
199
192
|
if (len <= 1)
|
200
193
|
break;
|
201
|
-
if (n >= max_word_length())
|
202
|
-
break;
|
203
194
|
|
204
195
|
m_pos += len;
|
205
196
|
n++;
|
data/ext/rmmseg/algor.h
CHANGED
@@ -22,33 +22,32 @@ namespace rmmseg
|
|
22
22
|
{
|
23
23
|
public:
|
24
24
|
Algorithm(const char *text, int length)
|
25
|
-
:
|
25
|
+
:m_text(text), m_pos(0),
|
26
26
|
m_text_length(length),
|
27
27
|
m_tmp_words_i(0),
|
28
28
|
m_match_cache_i(0)
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
29
|
+
{
|
30
|
+
for (int i = 0; i < match_cache_size; ++i)
|
31
|
+
m_match_cache[i].first = -1;
|
32
|
+
}
|
33
33
|
|
34
34
|
Token next_token();
|
35
35
|
|
36
|
+
const char *get_text() const
|
37
|
+
{
|
38
|
+
return m_text;
|
39
|
+
}
|
40
|
+
|
36
41
|
private:
|
37
42
|
Token get_basic_latin_word();
|
38
43
|
Token get_cjk_word(int);
|
39
|
-
|
40
|
-
static const int MAX_WORD_LENGTH = 4;
|
41
|
-
static const int MAX_N_CHUNKS = \
|
42
|
-
MAX_WORD_LENGTH*MAX_WORD_LENGTH*MAX_WORD_LENGTH;
|
43
44
|
|
44
|
-
|
45
|
+
std::vector<Chunk> create_chunks();
|
45
46
|
int next_word();
|
46
47
|
int next_char();
|
47
48
|
std::vector<Word *> find_match_words();
|
48
|
-
int max_word_length() { return
|
49
|
+
int max_word_length() { return 4; }
|
49
50
|
|
50
|
-
Chunk m_chunks[MAX_N_CHUNKS];
|
51
|
-
int m_chunks_size;
|
52
51
|
|
53
52
|
const char *m_text;
|
54
53
|
int m_pos;
|
@@ -65,7 +64,7 @@ namespace rmmseg
|
|
65
64
|
return &m_tmp_words[m_tmp_words_i++];
|
66
65
|
}
|
67
66
|
|
68
|
-
/* related to max_word_length and
|
67
|
+
/* related to max_word_length and match_cache_size */
|
69
68
|
static const int max_tmp_words = 64;
|
70
69
|
Word m_tmp_words[max_tmp_words];
|
71
70
|
int m_tmp_words_i;
|
data/ext/rmmseg/chunk.h
CHANGED
@@ -12,36 +12,37 @@ namespace rmmseg
|
|
12
12
|
*/
|
13
13
|
struct Chunk
|
14
14
|
{
|
15
|
-
int total_length()
|
15
|
+
int total_length() const
|
16
16
|
{
|
17
17
|
int len = 0;
|
18
18
|
for (int i = 0; i < n; ++i)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
len += std::abs(words[i]->length);
|
20
|
+
//if (words[i]->length == -1) /* tmp word */
|
21
|
+
// len += 1;
|
22
|
+
//else
|
23
|
+
// len += words[i]->length;
|
23
24
|
return len;
|
24
25
|
}
|
25
|
-
double average_length()
|
26
|
+
double average_length() const
|
26
27
|
{
|
27
28
|
return ((double)total_length())/n;
|
28
29
|
}
|
29
|
-
double variance()
|
30
|
+
double variance() const
|
30
31
|
{
|
31
32
|
double avg = average_length();
|
32
33
|
double sqr_sum = 0;
|
33
34
|
double tmp;
|
34
35
|
for (int i = 0; i < n; ++i)
|
35
36
|
{
|
36
|
-
tmp = words[i]->length;
|
37
|
-
if (tmp == -1)
|
38
|
-
|
37
|
+
tmp = std::abs(words[i]->length);
|
38
|
+
//if (tmp == -1)
|
39
|
+
// tmp = 1;
|
39
40
|
tmp = tmp-avg;
|
40
41
|
sqr_sum += tmp*tmp;
|
41
42
|
}
|
42
43
|
return std::sqrt(sqr_sum);
|
43
44
|
}
|
44
|
-
int degree_of_morphemic_freedom()
|
45
|
+
int degree_of_morphemic_freedom() const
|
45
46
|
{
|
46
47
|
int sum = 0;
|
47
48
|
for (int i = 0; i < n; ++i)
|
data/ext/rmmseg/dict.cpp
CHANGED
@@ -12,12 +12,12 @@ namespace rmmseg
|
|
12
12
|
Entry *next;
|
13
13
|
};
|
14
14
|
|
15
|
-
const
|
16
|
-
const
|
15
|
+
const size_t init_size = 262147;
|
16
|
+
const size_t max_density = 5;
|
17
17
|
/*
|
18
18
|
Table of prime numbers 2^n+a, 2<=n<=30.
|
19
19
|
*/
|
20
|
-
static
|
20
|
+
static size_t primes[] = {
|
21
21
|
524288 + 21,
|
22
22
|
1048576 + 7,
|
23
23
|
2097152 + 17,
|
@@ -33,14 +33,14 @@ namespace rmmseg
|
|
33
33
|
};
|
34
34
|
|
35
35
|
|
36
|
-
static
|
37
|
-
static
|
36
|
+
static size_t n_bins = init_size;
|
37
|
+
static size_t n_entries = 0;
|
38
38
|
static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
|
39
39
|
sizeof(Entry *)));
|
40
40
|
|
41
|
-
static
|
41
|
+
static size_t new_size()
|
42
42
|
{
|
43
|
-
for (
|
43
|
+
for (size_t i = 0;
|
44
44
|
i < sizeof(primes)/sizeof(primes[0]);
|
45
45
|
++i)
|
46
46
|
{
|
@@ -76,7 +76,7 @@ namespace rmmseg
|
|
76
76
|
Entry *entry, *next;
|
77
77
|
unsigned int hash_val;
|
78
78
|
|
79
|
-
for (
|
79
|
+
for (size_t i = 0; i < n_bins; ++i)
|
80
80
|
{
|
81
81
|
entry = bins[i];
|
82
82
|
while (entry)
|
@@ -140,6 +140,7 @@ namespace rmmseg
|
|
140
140
|
entry->next = NULL;
|
141
141
|
bins[h] = entry;
|
142
142
|
n_entries++;
|
143
|
+
return;
|
143
144
|
}
|
144
145
|
|
145
146
|
bool done = false;
|
@@ -168,6 +169,7 @@ namespace rmmseg
|
|
168
169
|
entry->word = word;
|
169
170
|
entry->next = bins[h];
|
170
171
|
bins[h] = entry;
|
172
|
+
n_entries++;
|
171
173
|
}
|
172
174
|
}
|
173
175
|
|
@@ -179,7 +181,7 @@ namespace rmmseg
|
|
179
181
|
return false;
|
180
182
|
}
|
181
183
|
|
182
|
-
const
|
184
|
+
const size_t buf_len = 24;
|
183
185
|
char buf[buf_len];
|
184
186
|
char *ptr;
|
185
187
|
|
data/ext/rmmseg/memory.cpp
CHANGED
data/ext/rmmseg/memory.h
CHANGED
@@ -12,12 +12,12 @@
|
|
12
12
|
|
13
13
|
namespace rmmseg
|
14
14
|
{
|
15
|
-
const
|
15
|
+
const size_t REALLOC_SIZE = 2048; /* 2KB */
|
16
16
|
|
17
|
-
extern
|
18
|
-
extern char
|
17
|
+
extern size_t _pool_size;
|
18
|
+
extern char *_pool_base;
|
19
19
|
|
20
|
-
inline void *pool_alloc(
|
20
|
+
inline void *pool_alloc(size_t len)
|
21
21
|
{
|
22
22
|
void *mem = _pool_base;
|
23
23
|
|
data/ext/rmmseg/rmmseg.cpp
CHANGED
@@ -36,7 +36,7 @@ extern "C" {
|
|
36
36
|
*/
|
37
37
|
static VALUE dic_load_chars(VALUE mod, VALUE path)
|
38
38
|
{
|
39
|
-
if (rmmseg::dict::load_chars(
|
39
|
+
if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
|
40
40
|
return Qtrue;
|
41
41
|
return Qfalse;
|
42
42
|
}
|
@@ -51,7 +51,7 @@ extern "C" {
|
|
51
51
|
*/
|
52
52
|
static VALUE dic_load_words(VALUE mod, VALUE path)
|
53
53
|
{
|
54
|
-
if (rmmseg::dict::load_words(
|
54
|
+
if (rmmseg::dict::load_words(RSTRING_PTR(path)))
|
55
55
|
return Qtrue;
|
56
56
|
return Qfalse;
|
57
57
|
}
|
@@ -70,8 +70,8 @@ extern "C" {
|
|
70
70
|
*/
|
71
71
|
static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
|
72
72
|
{
|
73
|
-
const char *str =
|
74
|
-
int nbytes =
|
73
|
+
const char *str = RSTRING_PTR(word);
|
74
|
+
int nbytes = RSTRING_LEN(word);
|
75
75
|
rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
|
76
76
|
rmmseg::dict::add(w);
|
77
77
|
return Qnil;
|
@@ -88,8 +88,8 @@ extern "C" {
|
|
88
88
|
*/
|
89
89
|
static VALUE dic_has_word(VALUE mod, VALUE word)
|
90
90
|
{
|
91
|
-
const char *str =
|
92
|
-
int nbytes =
|
91
|
+
const char *str = RSTRING_PTR(word);
|
92
|
+
int nbytes = RSTRING_LEN(word);
|
93
93
|
if (rmmseg::dict::get(str, nbytes) != NULL)
|
94
94
|
return Qtrue;
|
95
95
|
return Qfalse;
|
@@ -162,16 +162,17 @@ extern "C" {
|
|
162
162
|
int start = t.text-base;
|
163
163
|
|
164
164
|
// This is necessary, see
|
165
|
-
// http://pluskid.
|
165
|
+
// http://lifegoo.pluskid.org/?p=348
|
166
166
|
volatile VALUE text = rb_str_new(t.text, t.length);
|
167
167
|
tk->text = text;
|
168
168
|
|
169
169
|
tk->start = INT2FIX(start);
|
170
170
|
tk->end = INT2FIX(start + t.length);
|
171
|
-
|
171
|
+
volatile VALUE tok = Data_Wrap_Struct(cToken,
|
172
172
|
(RUBY_DATA_FUNC)tk_mark,
|
173
173
|
(RUBY_DATA_FUNC)tk_free,
|
174
174
|
tk);
|
175
|
+
return tok;
|
175
176
|
}
|
176
177
|
|
177
178
|
/*********************
|
@@ -207,8 +208,8 @@ extern "C" {
|
|
207
208
|
void *mem;
|
208
209
|
algor->text = text;
|
209
210
|
mem = malloc(sizeof(rmmseg::Algorithm));
|
210
|
-
algor->algor = new(mem) rmmseg::Algorithm(
|
211
|
-
|
211
|
+
algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
|
212
|
+
RSTRING_LEN(text));
|
212
213
|
|
213
214
|
return Data_Wrap_Struct(klass,
|
214
215
|
(RUBY_DATA_FUNC)algor_mark,
|
@@ -231,7 +232,8 @@ extern "C" {
|
|
231
232
|
|
232
233
|
if (tk.length == 0)
|
233
234
|
return Qnil;
|
234
|
-
|
235
|
+
volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
|
236
|
+
return rtk;
|
235
237
|
}
|
236
238
|
|
237
239
|
|
data/ext/rmmseg/rules.h
CHANGED
@@ -9,37 +9,36 @@
|
|
9
9
|
namespace rmmseg
|
10
10
|
{
|
11
11
|
template <typename Cmp>
|
12
|
-
|
12
|
+
void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
|
13
13
|
{
|
14
|
-
int i = 1, j;
|
15
|
-
|
16
|
-
|
17
|
-
for (j = 1; j < n; ++j)
|
14
|
+
unsigned int i = 1, j;
|
15
|
+
|
16
|
+
for (j = 1; j < chunks.size(); ++j)
|
18
17
|
{
|
19
|
-
int rlt = cmp(chunks[j],
|
18
|
+
int rlt = cmp(chunks[j], chunks[0]);
|
20
19
|
if (rlt > 0)
|
21
20
|
i = 0;
|
22
21
|
if (rlt >= 0)
|
23
22
|
std::swap(chunks[i++], chunks[j]);
|
24
23
|
}
|
25
|
-
|
24
|
+
chunks.erase(chunks.begin()+i, chunks.end());
|
26
25
|
}
|
27
26
|
|
28
27
|
struct MMCmp_t
|
29
28
|
{
|
30
|
-
int operator()(Chunk &a, Chunk &b)
|
29
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
31
30
|
{
|
32
31
|
return a.total_length() - b.total_length();
|
33
32
|
}
|
34
33
|
} MMCmp;
|
35
|
-
|
34
|
+
void mm_filter(std::vector<Chunk> &chunks)
|
36
35
|
{
|
37
|
-
|
36
|
+
take_highest(chunks, MMCmp);
|
38
37
|
}
|
39
38
|
|
40
39
|
struct LAWLCmp_t
|
41
40
|
{
|
42
|
-
int operator()(Chunk &a, Chunk &b)
|
41
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
43
42
|
{
|
44
43
|
double rlt = a.average_length() - b.average_length();
|
45
44
|
if (rlt == 0)
|
@@ -49,14 +48,14 @@ namespace rmmseg
|
|
49
48
|
return -1;
|
50
49
|
}
|
51
50
|
} LAWLCmp;
|
52
|
-
|
51
|
+
void lawl_filter(std::vector<Chunk> &chunks)
|
53
52
|
{
|
54
|
-
|
53
|
+
take_highest(chunks, LAWLCmp);
|
55
54
|
}
|
56
55
|
|
57
56
|
struct SVWLCmp_t
|
58
57
|
{
|
59
|
-
int operator()(Chunk &a, Chunk& b)
|
58
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
60
59
|
{
|
61
60
|
double rlt = a.variance() - b.variance();
|
62
61
|
if (rlt == 0)
|
@@ -66,21 +65,21 @@ namespace rmmseg
|
|
66
65
|
return -1;
|
67
66
|
}
|
68
67
|
} SVWLCmp;
|
69
|
-
|
68
|
+
void svwl_filter(std::vector<Chunk> &chunks)
|
70
69
|
{
|
71
|
-
|
70
|
+
take_highest(chunks, SVWLCmp);
|
72
71
|
}
|
73
72
|
|
74
73
|
struct LSDMFOCWCmp_t
|
75
74
|
{
|
76
|
-
int operator()(Chunk &a, Chunk& b)
|
75
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
77
76
|
{
|
78
77
|
return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
|
79
78
|
}
|
80
79
|
} LSDMFOCWCmp;
|
81
|
-
|
80
|
+
void lsdmfocw_filter(std::vector<Chunk> &chunks)
|
82
81
|
{
|
83
|
-
|
82
|
+
take_highest(chunks, LSDMFOCWCmp);
|
84
83
|
}
|
85
84
|
}
|
86
85
|
|
data/ext/rmmseg/word.h
CHANGED
@@ -28,11 +28,11 @@ namespace rmmseg
|
|
28
28
|
if (freq > USHRT_MAX)
|
29
29
|
freq = USHRT_MAX; /* avoid overflow */
|
30
30
|
if (nbytes == -1)
|
31
|
-
nbytes = strlen(text);
|
31
|
+
nbytes = std::strlen(text);
|
32
32
|
Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
|
33
33
|
+ nbytes+1
|
34
34
|
- word_embed_len));
|
35
|
-
w->nbytes =
|
35
|
+
w->nbytes = nbytes;
|
36
36
|
w->length = length;
|
37
37
|
w->freq = freq;
|
38
38
|
std::strncpy(w->text, text, nbytes);
|
data/misc/homepage.erb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
<%# -*- mode: text; coding: utf-8 -*- %>
|
2
2
|
<%
|
3
3
|
$title = "rmmseg-cpp Homepage"
|
4
|
-
$authors = { 'pluskid' => 'http://pluskid.
|
4
|
+
$authors = { 'pluskid' => 'http://blog.pluskid.org' }
|
5
5
|
%>
|
6
6
|
|
7
7
|
<% chapter "Introduction" do %>
|
@@ -110,7 +110,24 @@
|
|
110
110
|
RMMSeg::Dictionary.load_dictionaries
|
111
111
|
</code>
|
112
112
|
|
113
|
-
Now rmmseg-cpp will be ready to do segmenting.
|
113
|
+
Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
|
114
|
+
dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
|
115
|
+
<tt>load_dictionaries</tt>. e.g.
|
116
|
+
|
117
|
+
<code>
|
118
|
+
RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
|
119
|
+
[:words, "my_words.dic"],
|
120
|
+
[:words, "my_words2.dic"]]
|
121
|
+
</code>
|
122
|
+
|
123
|
+
The basic formats of the char-dictionary and word-dictionary are similar. For each line,
|
124
|
+
there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
|
125
|
+
at the end of the dictionary file. And the number in char-dictionary and word-dictionary
|
126
|
+
has a different meaning in each.
|
127
|
+
|
128
|
+
In char-dictionary, the number means the frequency of the character. In word-dictionary,
|
129
|
+
the number means the number of characters in the word. Note that this is NOT the number
|
130
|
+
of *bytes* in the word.
|
114
131
|
|
115
132
|
<% end %>
|
116
133
|
|
@@ -139,7 +156,7 @@
|
|
139
156
|
of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
|
140
157
|
|
141
158
|
<% figure "Ferret Example Screenshot" do %>
|
142
|
-
!http://pluskid.
|
159
|
+
!http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
|
143
160
|
<% end %>
|
144
161
|
|
145
162
|
<% end %>
|
@@ -174,6 +191,6 @@
|
|
174
191
|
<% chapter "Resources" do %>
|
175
192
|
* "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
|
176
193
|
* "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
|
177
|
-
* "Free Mind":http://pluskid.
|
194
|
+
* "Free Mind":http://blog.pluskid.org/: The author's blog.
|
178
195
|
* "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
|
179
196
|
<% end %>
|
data/misc/homepage.html
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
|
6
|
-
<meta name="date" content="
|
6
|
+
<meta name="date" content="10 September 2011"/>
|
7
7
|
<meta name="author" content="pluskid"/>
|
8
8
|
<meta name="generator" content="Gerbil 3.1.0"/>
|
9
9
|
<title>rmmseg-cpp Homepage</title>
|
@@ -928,8 +928,8 @@
|
|
928
928
|
|
929
929
|
|
930
930
|
<h1 class="title"><a class="here" href="#">rmmseg-cpp Homepage</a></h1>
|
931
|
-
<h2 class="authors"><a href="http://pluskid.
|
932
|
-
<h3 class="date">
|
931
|
+
<h2 class="authors"><a href="http://blog.pluskid.org">pluskid</a></h2>
|
932
|
+
<h3 class="date">10 September 2011</h3>
|
933
933
|
|
934
934
|
</div>
|
935
935
|
|
@@ -943,12 +943,12 @@
|
|
943
943
|
<div id="Contents">
|
944
944
|
<h1 class="title"><a class="here" href="#Contents">Contents</a></h1>
|
945
945
|
<ul>
|
946
|
-
<li>1 <a id="
|
946
|
+
<li>1 <a id="a16539600" href="#Introduction">Introduction</a></li><li>2 <a id="a16533660" href="#Setup">Setup</a><ul><li>2.1 <a id="a16530900" href="#Requirements">Requirements</a></li><li>2.2 <a id="a16472140" href="#Installation">Installation</a><ul><li>2.2.1 <a id="a16468300" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2 <a id="a16363260" href="#From-Git">From Git</a></li></ul></li></ul></li><li>3 <a id="a16272720" href="#Usage">Usage</a><ul><li>3.1 <a id="a16246860" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2 <a id="a16240340" href="#Use-in-Ruby-program">Use in Ruby program</a><ul><li>3.2.1 <a id="a16231580" href="#Initialize">Initialize</a></li><li>3.2.2 <a id="a16187880" href="#Ferret-Integration">Ferret Integration</a></li><li>3.2.3 <a id="a16113620" href="#Normal-Ruby-program">Normal Ruby program</a></li></ul></li></ul></li><li>4 <a id="a16072000" href="#Who-use-it">Who use it</a></li><li>5 <a id="a16034860" href="#Resources">Resources</a></li>
|
947
947
|
|
948
948
|
</ul>
|
949
949
|
</div>
|
950
950
|
|
951
|
-
<div id="lof"><h1 id
|
951
|
+
<div id="lof"><h1 id="Figures" class="title"><a class="here" href="#Figures">Figures</a></h1> <ol><li><a id="a16148860" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1 id="Tips" class="title"><a class="here" href="#Tips">Tips</a></h1> <ol><li><a id="a16067160" href="#Expand-this-list">Expand this list</a></li></ol><h1 id="Warnings" class="title"><a class="here" href="#Warnings">Warnings</a></h1> <ol><li><a id="a16360020" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></li></ol></div>
|
952
952
|
|
953
953
|
<br style="display: none"/>
|
954
954
|
<hr style="display: none"/>
|
@@ -958,30 +958,30 @@
|
|
958
958
|
<div class="chapter">
|
959
959
|
<h1 class="title">
|
960
960
|
Chapter
|
961
|
-
<a class="list" id="Introduction" href="#
|
961
|
+
<a class="list" id="Introduction" href="#a16539600">1</a>
|
962
962
|
|
963
963
|
<br/>
|
964
964
|
|
965
965
|
<a class="here" href="#Introduction"><big>Introduction</big></a>
|
966
966
|
</h1>
|
967
967
|
|
968
|
-
<div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for
|
969
|
-
Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration
|
968
|
+
<div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for
|
969
|
+
Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration
|
970
970
|
as well as support for normal Ruby program usage.</p>
|
971
|
-
<p>rmmseg-cpp is a re-written of the original
|
972
|
-
<a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written
|
973
|
-
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
971
|
+
<p>rmmseg-cpp is a re-written of the original
|
972
|
+
<a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written
|
973
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
974
974
|
lots of memory and the segmenting process is rather slow.</p>
|
975
|
-
<p>The interface is almost identical to RMMSeg but the performance is
|
976
|
-
much better. This gem is always preferable in production
|
977
|
-
use. However, if you want to understand how the MMSEG segmenting
|
978
|
-
algorithm works, the source code of RMMSeg is a better choice than
|
975
|
+
<p>The interface is almost identical to RMMSeg but the performance is
|
976
|
+
much better. This gem is always preferable in production
|
977
|
+
use. However, if you want to understand how the MMSEG segmenting
|
978
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
979
979
|
this.</p></div>
|
980
980
|
</div>
|
981
981
|
<div class="chapter">
|
982
982
|
<h1 class="title">
|
983
983
|
Chapter
|
984
|
-
<a class="list" id="Setup" href="#
|
984
|
+
<a class="list" id="Setup" href="#a16533660">2</a>
|
985
985
|
|
986
986
|
<br/>
|
987
987
|
|
@@ -990,7 +990,7 @@ this.</p></div>
|
|
990
990
|
|
991
991
|
<div class="content"><div class="section">
|
992
992
|
<h2 class="title">
|
993
|
-
<a class="list" id="Requirements" href="#
|
993
|
+
<a class="list" id="Requirements" href="#a16530900">2.1</a> <a class="here" href="#Requirements">Requirements</a>
|
994
994
|
</h2>
|
995
995
|
<div class="content"><p>Your system needs the following software to run RMMSeg.</p>
|
996
996
|
<table border="1">
|
@@ -1011,37 +1011,37 @@ this.</p></div>
|
|
1011
1011
|
<td> Used to build the native extension </td>
|
1012
1012
|
</tr>
|
1013
1013
|
</table></div>
|
1014
|
-
</div
|
1014
|
+
</div>
|
1015
1015
|
<div class="section">
|
1016
1016
|
<h2 class="title">
|
1017
|
-
<a class="list" id="Installation" href="#
|
1017
|
+
<a class="list" id="Installation" href="#a16472140">2.2</a> <a class="here" href="#Installation">Installation</a>
|
1018
1018
|
</h2>
|
1019
1019
|
<div class="content"><div class="section">
|
1020
1020
|
<h3 class="title">
|
1021
|
-
<a class="list" id="Using-RubyGems" href="#
|
1021
|
+
<a class="list" id="Using-RubyGems" href="#a16468300">2.2.1</a> <a class="here" href="#Using-RubyGems">Using RubyGems</a>
|
1022
1022
|
</h3>
|
1023
1023
|
<div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a>:</p>
|
1024
1024
|
sudo gem install rmmseg-cpp
|
1025
|
-
<p>Or you can download the gem file manually from
|
1026
|
-
<a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and
|
1025
|
+
<p>Or you can download the gem file manually from
|
1026
|
+
<a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and
|
1027
1027
|
install it locally:</p>
|
1028
1028
|
sudo gem install --local rmmseg-cpp-x.y.z.gem</div>
|
1029
|
-
</div
|
1029
|
+
</div>
|
1030
1030
|
<div class="section">
|
1031
1031
|
<h3 class="title">
|
1032
|
-
<a class="list" id="From-Git" href="#
|
1032
|
+
<a class="list" id="From-Git" href="#a16363260">2.2.2</a> <a class="here" href="#From-Git">From Git</a>
|
1033
1033
|
</h3>
|
1034
|
-
<div class="content"><p>To build the gem manually from the latest source code. You’ll
|
1034
|
+
<div class="content"><p>To build the gem manually from the latest source code. You’ll
|
1035
1035
|
need to have <strong>git</strong> and <strong>rake</strong> installed.</p>
|
1036
1036
|
<p><div class="warning">
|
1037
|
-
<p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#
|
1037
|
+
<p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#a16360020">Warning 1</a>. <a class="here" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></p>
|
1038
1038
|
|
1039
|
-
<div class="content icon-warning">While I tried to avoid such kind of problems, the source
|
1040
|
-
code from the repository might still be broken sometimes
|
1039
|
+
<div class="content icon-warning">While I tried to avoid such kind of problems, the source
|
1040
|
+
code from the repository might still be broken sometimes.
|
1041
1041
|
It is generally not recommended to follow the source code.</div>
|
1042
|
-
</div>
|
1043
|
-
The source code of rmmseg-cpp is hosted at
|
1044
|
-
<a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the
|
1042
|
+
</div>
|
1043
|
+
The source code of rmmseg-cpp is hosted at
|
1044
|
+
<a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the
|
1045
1045
|
source code by git clone:</p>
|
1046
1046
|
git clone git://github.com/pluskid/rmmseg-cpp.git
|
1047
1047
|
<p>then you can use Rake to build and install the gem:</p>
|
@@ -1053,97 +1053,107 @@ rake gem:install</div>
|
|
1053
1053
|
<div class="chapter">
|
1054
1054
|
<h1 class="title">
|
1055
1055
|
Chapter
|
1056
|
-
<a class="list" id="Usage" href="#
|
1056
|
+
<a class="list" id="Usage" href="#a16272720">3</a>
|
1057
1057
|
|
1058
1058
|
<br/>
|
1059
1059
|
|
1060
1060
|
<a class="here" href="#Usage"><big>Usage</big></a>
|
1061
1061
|
</h1>
|
1062
1062
|
|
1063
|
-
<div class="content"
|
1063
|
+
<div class="content">
|
1064
|
+
<p><div class="section">
|
1064
1065
|
<h2 class="title">
|
1065
|
-
<a class="list" id="Stand-Alone-rmmseg" href="#
|
1066
|
+
<a class="list" id="Stand-Alone-rmmseg" href="#a16246860">3.1</a> <a class="here" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a>
|
1066
1067
|
</h2>
|
1067
|
-
<div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option
|
1068
|
-
|
1069
|
-
|
1070
|
-
<p>It reads from STDIN and print result to STDOUT. Here is a real<br />
|
1068
|
+
<div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option:</p>
|
1069
|
+
rmmseg -h
|
1070
|
+
<p>It reads from STDIN and print result to STDOUT. Here is a real
|
1071
1071
|
example:</p>
|
1072
1072
|
$ echo "我们都喜欢用 Ruby" | rmmseg
|
1073
1073
|
我们 都 喜欢 用 Ruby</div>
|
1074
|
-
</div
|
1074
|
+
</div>
|
1075
1075
|
<div class="section">
|
1076
1076
|
<h2 class="title">
|
1077
|
-
<a class="list" id="Use-in-Ruby-program" href="#
|
1077
|
+
<a class="list" id="Use-in-Ruby-program" href="#a16240340">3.2</a> <a class="here" href="#Use-in-Ruby-program">Use in Ruby program</a>
|
1078
1078
|
</h2>
|
1079
1079
|
<div class="content"><div class="section">
|
1080
1080
|
<h3 class="title">
|
1081
|
-
<a class="list" id="Initialize" href="#
|
1081
|
+
<a class="list" id="Initialize" href="#a16231580">3.2.1</a> <a class="here" href="#Initialize">Initialize</a>
|
1082
1082
|
</h3>
|
1083
1083
|
<div class="content"><p>To use rmmseg-cpp in Ruby program, you’ll first load it with RubyGems:</p>
|
1084
1084
|
<pre class="code">
|
1085
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1086
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1085
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rubygems</span><span style="color:#710">'</span></span>
|
1086
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg</span><span style="color:#710">'</span></span>
|
1087
1087
|
</pre>
|
1088
|
-
<p>Then you may customize the dictionaries used by rmmseg-cpp
|
1089
|
-
(see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on
|
1088
|
+
<p>Then you may customize the dictionaries used by rmmseg-cpp
|
1089
|
+
(see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on
|
1090
1090
|
how to add your own dictionaries) and load all dictionaries:</p>
|
1091
1091
|
<pre class="code">
|
1092
|
-
<span style="color:#036;
|
1092
|
+
<span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.load_dictionaries
|
1093
|
+
</pre>
|
1094
|
+
<p>Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
|
1095
|
+
dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
|
1096
|
+
<tt>load_dictionaries</tt>. e.g.</p>
|
1097
|
+
<pre class="code">
|
1098
|
+
<span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.dictionaries = [[<span style="color:#A60">:chars</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_chars.dic</span><span style="color:#710">"</span></span>],
|
1099
|
+
[<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_words.dic</span><span style="color:#710">"</span></span>],
|
1100
|
+
[<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_words2.dic</span><span style="color:#710">"</span></span>]]
|
1093
1101
|
</pre>
|
1094
|
-
<p>
|
1095
|
-
</
|
1102
|
+
<p>The basic formats of the char-dictionary and word-dictionary are similar. For each line,
|
1103
|
+
there is a number, then <strong>a</strong> space, then the string. Note there <strong>SHOULD</strong> be a newline
|
1104
|
+
at the end of the dictionary file. And the number in char-dictionary and word-dictionary
|
1105
|
+
has a different meaning in each.</p>
|
1106
|
+
<p>In char-dictionary, the number means the frequency of the character. In word-dictionary,
|
1107
|
+
the number means the number of characters in the word. Note that this is NOT the number
|
1108
|
+
of <strong>bytes</strong> in the word.</p></div>
|
1109
|
+
</div>
|
1096
1110
|
<div class="section">
|
1097
1111
|
<h3 class="title">
|
1098
|
-
<a class="list" id="Ferret-Integration" href="#
|
1112
|
+
<a class="list" id="Ferret-Integration" href="#a16187880">3.2.2</a> <a class="here" href="#Ferret-Integration">Ferret Integration</a>
|
1099
1113
|
</h3>
|
1100
|
-
<div class="content"><p>To use rmmseg-cpp with Ferret, you’ll need to <code class="code">require</code> the
|
1101
|
-
Ferret support of rmmseg-cpp (Of course you’ll also have to
|
1102
|
-
got Ferret installed. If you have problems running the belowing
|
1103
|
-
example, please try to update to the latest version of both
|
1104
|
-
Ferret and rmmseg-cpp first)
|
1105
|
-
<br />
|
1114
|
+
<div class="content"><p>To use rmmseg-cpp with Ferret, you’ll need to <code class="code">require</code> the
|
1115
|
+
Ferret support of rmmseg-cpp (Of course you’ll also have to
|
1116
|
+
get Ferret installed. If you have problems running the following
|
1117
|
+
example, please try to update to the latest version of both
|
1118
|
+
Ferret and rmmseg-cpp first):</p>
|
1106
1119
|
<pre class="code">
|
1107
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1108
|
-
</pre
|
1109
|
-
<p>rmmseg-cpp comes with a ready to use Ferret analyzer
|
1110
|
-
<br />
|
1120
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg/ferret</span><span style="color:#710">'</span></span>
|
1121
|
+
</pre>
|
1122
|
+
<p>rmmseg-cpp comes with a ready to use Ferret analyzer:</p>
|
1111
1123
|
<pre class="code">
|
1112
|
-
analyzer = <span style="color:#036;
|
1113
|
-
<span style="color:#036;
|
1124
|
+
analyzer = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analyzer</span>.new { |tokenizer|
|
1125
|
+
<span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analysis</span>::<span style="color:#036;font-weight:bold">LowerCaseFilter</span>.new(tokenizer)
|
1114
1126
|
}
|
1115
|
-
index = <span style="color:#036;
|
1116
|
-
</pre
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
<
|
1121
|
-
<div class="
|
1122
|
-
<p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a-607147048">Figure 1</a>. <a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
|
1123
|
-
<div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1127
|
+
index = <span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Index</span>::<span style="color:#036;font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> => analyzer)
|
1128
|
+
</pre>
|
1129
|
+
<p>A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
|
1130
|
+
of running that example is shown in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1. Ferret Example Screenshot</a>.</p>
|
1131
|
+
<p><div class="figure">
|
1132
|
+
<p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a16148860">Figure 1</a>. <a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
|
1133
|
+
<div class="content"><img src="http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1124
1134
|
</div></p></div>
|
1125
|
-
</div
|
1135
|
+
</div>
|
1126
1136
|
<div class="section">
|
1127
1137
|
<h3 class="title">
|
1128
|
-
<a class="list" id="Normal-Ruby-program" href="#
|
1138
|
+
<a class="list" id="Normal-Ruby-program" href="#a16113620">3.2.3</a> <a class="here" href="#Normal-Ruby-program">Normal Ruby program</a>
|
1129
1139
|
</h3>
|
1130
|
-
<div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create
|
1131
|
-
an <code class="code"><span style="color:#036;
|
1140
|
+
<div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create
|
1141
|
+
an <code class="code"><span style="color:#036;font-weight:bold">Algorithm</span></code> object and call <code class="code">next_token</code> until a <code class="code"><span style="color:#038;font-weight:bold">nil</span></code> is returned:</p>
|
1132
1142
|
<pre class="code">
|
1133
|
-
algor = <span style="color:#036;
|
1134
|
-
loop <span style="color:#080;
|
1143
|
+
algor = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Algorithm</span>.new(text)
|
1144
|
+
loop <span style="color:#080;font-weight:bold">do</span>
|
1135
1145
|
tok = algor.next_token
|
1136
|
-
<span style="color:#080;
|
1137
|
-
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="background:
|
1138
|
-
<span style="color:#080;
|
1146
|
+
<span style="color:#080;font-weight:bold">break</span> <span style="color:#080;font-weight:bold">if</span> tok.nil?
|
1147
|
+
puts <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.text<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style=""> [</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.start<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">..</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.end<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">]</span><span style="color:#710">"</span></span>
|
1148
|
+
<span style="color:#080;font-weight:bold">end</span>
|
1139
1149
|
</pre></div>
|
1140
1150
|
</div></div>
|
1141
|
-
</div></div>
|
1151
|
+
</div></p></div>
|
1142
1152
|
</div>
|
1143
1153
|
<div class="chapter">
|
1144
1154
|
<h1 class="title">
|
1145
1155
|
Chapter
|
1146
|
-
<a class="list" id="Who-use-it" href="#
|
1156
|
+
<a class="list" id="Who-use-it" href="#a16072000">4</a>
|
1147
1157
|
|
1148
1158
|
<br/>
|
1149
1159
|
|
@@ -1151,20 +1161,20 @@ loop <span style="color:#080; font-weight:bold">do</span>
|
|
1151
1161
|
</h1>
|
1152
1162
|
|
1153
1163
|
<div class="content"><p><div class="tip">
|
1154
|
-
<p class="title"><a class="list" id="Expand-this-list" href="#
|
1164
|
+
<p class="title"><a class="list" id="Expand-this-list" href="#a16067160">Tip 1</a>. <a class="here" href="#Expand-this-list">Expand this list</a></p>
|
1155
1165
|
|
1156
|
-
<div class="content icon-tip">If you used rmmseg-cpp and would like your project to
|
1166
|
+
<div class="content icon-tip">If you used rmmseg-cpp and would like your project to
|
1157
1167
|
appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</div>
|
1158
1168
|
</div></p>
|
1159
1169
|
<ul>
|
1160
|
-
<li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developper
|
1170
|
+
<li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developer
|
1161
1171
|
communities in China.</li>
|
1162
1172
|
</ul></div>
|
1163
1173
|
</div>
|
1164
1174
|
<div class="chapter">
|
1165
1175
|
<h1 class="title">
|
1166
1176
|
Chapter
|
1167
|
-
<a class="list" id="Resources" href="#
|
1177
|
+
<a class="list" id="Resources" href="#a16034860">5</a>
|
1168
1178
|
|
1169
1179
|
<br/>
|
1170
1180
|
|
@@ -1174,7 +1184,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
|
|
1174
1184
|
<div class="content"><ul>
|
1175
1185
|
<li><a href="http://rubyforge.org/projects/rmmseg-cpp/">Project Home</a>: The Project page at RubyForge.</li>
|
1176
1186
|
<li><a href="http://rmmseg-cpp.rubyforge.org/rdoc/index.html">RDoc of rmmseg-cpp</a>: The auto generated rdoc of RMMSeg.</li>
|
1177
|
-
<li><a href="http://pluskid.
|
1187
|
+
<li><a href="http://blog.pluskid.org/">Free Mind</a>: The author’s blog.</li>
|
1178
1188
|
<li><a href="mailto:pluskid@gmail.com">Author’s Email</a>: Contact me if you have any problem.</li>
|
1179
1189
|
</ul></div>
|
1180
1190
|
</div></div>
|
@@ -1187,7 +1197,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
|
|
1187
1197
|
|
1188
1198
|
<div id="footer">
|
1189
1199
|
|
1190
|
-
Generated on
|
1200
|
+
Generated on 2011-09-10 15:59:08 +0800 by <a href="http://gerbil.rubyforge.org">Gerbil</a> 3.1.0.
|
1191
1201
|
|
1192
1202
|
<div id="footer-credits">
|
1193
1203
|
<span class="icon-warning" style="float: right"> </span>
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg-cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 2
|
8
|
+
- 9
|
9
|
+
version: 0.2.9
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- pluskid
|
@@ -9,11 +14,25 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date:
|
17
|
+
date: 2011-09-10 00:00:00 +08:00
|
13
18
|
default_executable:
|
14
19
|
dependencies: []
|
15
20
|
|
16
|
-
description:
|
21
|
+
description: |-
|
22
|
+
rmmseg-cpp is a high performance Chinese word segmentation utility for
|
23
|
+
Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
|
24
|
+
as well as support for normal Ruby program usage.
|
25
|
+
|
26
|
+
rmmseg-cpp is a rewrite of the original
|
27
|
+
RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written
|
28
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
29
|
+
lots of memory and the segmenting process is rather slow.
|
30
|
+
|
31
|
+
The interface is almost identical to RMMSeg but the performance is
|
32
|
+
much better. This gem is always preferable in production
|
33
|
+
use. However, if you want to understand how the MMSEG segmenting
|
34
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
35
|
+
this.
|
17
36
|
email: pluskid@gmail.com
|
18
37
|
executables:
|
19
38
|
- rmmseg
|
@@ -69,6 +88,8 @@ files:
|
|
69
88
|
- test/test_rmmseg.rb
|
70
89
|
has_rdoc: true
|
71
90
|
homepage: http://rmmseg-cpp.rubyforge.org
|
91
|
+
licenses: []
|
92
|
+
|
72
93
|
post_install_message:
|
73
94
|
rdoc_options:
|
74
95
|
- --main
|
@@ -77,23 +98,27 @@ require_paths:
|
|
77
98
|
- lib
|
78
99
|
- ext
|
79
100
|
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
80
102
|
requirements:
|
81
103
|
- - ">="
|
82
104
|
- !ruby/object:Gem::Version
|
105
|
+
segments:
|
106
|
+
- 0
|
83
107
|
version: "0"
|
84
|
-
version:
|
85
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
86
110
|
requirements:
|
87
111
|
- - ">="
|
88
112
|
- !ruby/object:Gem::Version
|
113
|
+
segments:
|
114
|
+
- 0
|
89
115
|
version: "0"
|
90
|
-
version:
|
91
116
|
requirements: []
|
92
117
|
|
93
118
|
rubyforge_project: rmmseg-cpp
|
94
|
-
rubygems_version: 1.
|
119
|
+
rubygems_version: 1.3.7
|
95
120
|
signing_key:
|
96
|
-
specification_version:
|
121
|
+
specification_version: 3
|
97
122
|
summary: rmmseg-cpp is a high performance Chinese word segmentation utility for Ruby
|
98
123
|
test_files:
|
99
124
|
- test/test_rmmseg.rb
|