rmmseg-cpp 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/README +11 -0
- data/Rakefile +1 -1
- data/ext/rmmseg/algor.cpp +24 -33
- data/ext/rmmseg/algor.h +13 -14
- data/ext/rmmseg/chunk.h +12 -11
- data/ext/rmmseg/dict.cpp +11 -9
- data/ext/rmmseg/memory.cpp +2 -2
- data/ext/rmmseg/memory.h +4 -4
- data/ext/rmmseg/rmmseg.cpp +13 -11
- data/ext/rmmseg/rules.h +18 -19
- data/ext/rmmseg/word.h +2 -2
- data/misc/homepage.erb +21 -4
- data/misc/homepage.html +97 -87
- metadata +32 -7
data/History.txt
CHANGED
data/README
CHANGED
@@ -43,6 +43,17 @@ init by loading the dictionaries:
|
|
43
43
|
|
44
44
|
RMMSeg::Dictionary.load_dictionaries
|
45
45
|
|
46
|
+
If you want to add customized dictionaries, append them to
|
47
|
+
+RMMSeg::Dictionary.dictionaries+ before calling +load_dictionaries+.
|
48
|
+
The formats of chars.dic and words.dic are NOT the same:
|
49
|
+
|
50
|
+
* For chars.dic, each line contains freq, a space, and then the character
|
51
|
+
* For words.dic, each line contains length, a space, and then the word.
|
52
|
+
|
53
|
+
Note that length means the length of the word, i.e. the number of characters
|
54
|
+
of the word, not number of bytes. WARNING: there should be a newline at
|
55
|
+
the end of every dictionary file.
|
56
|
+
|
46
57
|
Then create an +Algorithm+ object and call +next_token+ until it returns
|
47
58
|
+nil+:
|
48
59
|
|
data/Rakefile
CHANGED
data/ext/rmmseg/algor.cpp
CHANGED
@@ -71,27 +71,28 @@ namespace rmmseg
|
|
71
71
|
|
72
72
|
Token Algorithm::get_cjk_word(int len)
|
73
73
|
{
|
74
|
-
create_chunks();
|
75
|
-
|
76
|
-
if (
|
77
|
-
|
78
|
-
if (
|
79
|
-
|
80
|
-
if (
|
81
|
-
|
82
|
-
if (
|
83
|
-
|
84
|
-
|
85
|
-
if (
|
74
|
+
vector<Chunk> chunks = create_chunks();
|
75
|
+
|
76
|
+
if (chunks.size() > 1)
|
77
|
+
mm_filter(chunks);
|
78
|
+
if (chunks.size() > 1)
|
79
|
+
lawl_filter(chunks);
|
80
|
+
if (chunks.size() > 1)
|
81
|
+
svwl_filter(chunks);
|
82
|
+
if (chunks.size() > 1)
|
83
|
+
lsdmfocw_filter(chunks);
|
84
|
+
|
85
|
+
if (chunks.size() < 1)
|
86
86
|
return Token(NULL, 0);
|
87
87
|
|
88
|
-
Token token(m_text+m_pos,
|
89
|
-
m_pos +=
|
88
|
+
Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
|
89
|
+
m_pos += chunks[0].words[0]->nbytes;
|
90
90
|
return token;
|
91
91
|
}
|
92
92
|
|
93
|
-
|
93
|
+
vector<Chunk> Algorithm::create_chunks()
|
94
94
|
{
|
95
|
+
vector<Chunk> chunks;
|
95
96
|
Chunk chunk;
|
96
97
|
Word *w1, *w2, *w3;
|
97
98
|
|
@@ -100,8 +101,6 @@ namespace rmmseg
|
|
100
101
|
typedef vec_t::iterator it_t;
|
101
102
|
|
102
103
|
vec_t words1 = find_match_words();
|
103
|
-
m_chunks_size = 0;
|
104
|
-
|
105
104
|
for (it_t i1 = words1.begin();
|
106
105
|
i1 != words1.end();
|
107
106
|
++i1)
|
@@ -136,17 +135,13 @@ namespace rmmseg
|
|
136
135
|
chunk.n = 3;
|
137
136
|
chunk.words[2] = w3;
|
138
137
|
}
|
139
|
-
|
140
|
-
sizeof(Chunk));
|
141
|
-
m_chunks_size++;
|
138
|
+
chunks.push_back(chunk);
|
142
139
|
}
|
143
140
|
}
|
144
141
|
else if (m_pos == m_text_length)
|
145
142
|
{
|
146
143
|
chunk.n = 2;
|
147
|
-
|
148
|
-
sizeof(Chunk));
|
149
|
-
m_chunks_size++;
|
144
|
+
chunks.push_back(chunk);
|
150
145
|
}
|
151
146
|
m_pos -= w2->nbytes;
|
152
147
|
}
|
@@ -154,13 +149,13 @@ namespace rmmseg
|
|
154
149
|
else if (m_pos == m_text_length)
|
155
150
|
{
|
156
151
|
chunk.n = 1;
|
157
|
-
|
158
|
-
m_chunks_size++;
|
152
|
+
chunks.push_back(chunk);
|
159
153
|
}
|
160
154
|
m_pos -= w1->nbytes;
|
161
155
|
}
|
162
156
|
|
163
157
|
m_pos = orig_pos;
|
158
|
+
return chunks;
|
164
159
|
}
|
165
160
|
|
166
161
|
int Algorithm::next_char()
|
@@ -169,15 +164,11 @@ namespace rmmseg
|
|
169
164
|
unsigned char ch = m_text[m_pos];
|
170
165
|
if (ch >= 0xC0 && ch <= 0xDF)
|
171
166
|
{
|
172
|
-
|
173
|
-
return 1; /* broken text at the end */
|
174
|
-
return 2;
|
167
|
+
return min(2, m_text_length-m_pos);
|
175
168
|
}
|
176
169
|
if (ch >= 0xE0 && ch <= 0xEF)
|
177
170
|
{
|
178
|
-
|
179
|
-
return 1; /* broken text at the end */
|
180
|
-
return 3;
|
171
|
+
return min(3, m_text_length-m_pos);
|
181
172
|
}
|
182
173
|
return 1;
|
183
174
|
}
|
@@ -195,11 +186,11 @@ namespace rmmseg
|
|
195
186
|
|
196
187
|
while (m_pos < m_text_length)
|
197
188
|
{
|
189
|
+
if (n >= max_word_length())
|
190
|
+
break;
|
198
191
|
len = next_char();
|
199
192
|
if (len <= 1)
|
200
193
|
break;
|
201
|
-
if (n >= max_word_length())
|
202
|
-
break;
|
203
194
|
|
204
195
|
m_pos += len;
|
205
196
|
n++;
|
data/ext/rmmseg/algor.h
CHANGED
@@ -22,33 +22,32 @@ namespace rmmseg
|
|
22
22
|
{
|
23
23
|
public:
|
24
24
|
Algorithm(const char *text, int length)
|
25
|
-
:
|
25
|
+
:m_text(text), m_pos(0),
|
26
26
|
m_text_length(length),
|
27
27
|
m_tmp_words_i(0),
|
28
28
|
m_match_cache_i(0)
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
29
|
+
{
|
30
|
+
for (int i = 0; i < match_cache_size; ++i)
|
31
|
+
m_match_cache[i].first = -1;
|
32
|
+
}
|
33
33
|
|
34
34
|
Token next_token();
|
35
35
|
|
36
|
+
const char *get_text() const
|
37
|
+
{
|
38
|
+
return m_text;
|
39
|
+
}
|
40
|
+
|
36
41
|
private:
|
37
42
|
Token get_basic_latin_word();
|
38
43
|
Token get_cjk_word(int);
|
39
|
-
|
40
|
-
static const int MAX_WORD_LENGTH = 4;
|
41
|
-
static const int MAX_N_CHUNKS = \
|
42
|
-
MAX_WORD_LENGTH*MAX_WORD_LENGTH*MAX_WORD_LENGTH;
|
43
44
|
|
44
|
-
|
45
|
+
std::vector<Chunk> create_chunks();
|
45
46
|
int next_word();
|
46
47
|
int next_char();
|
47
48
|
std::vector<Word *> find_match_words();
|
48
|
-
int max_word_length() { return
|
49
|
+
int max_word_length() { return 4; }
|
49
50
|
|
50
|
-
Chunk m_chunks[MAX_N_CHUNKS];
|
51
|
-
int m_chunks_size;
|
52
51
|
|
53
52
|
const char *m_text;
|
54
53
|
int m_pos;
|
@@ -65,7 +64,7 @@ namespace rmmseg
|
|
65
64
|
return &m_tmp_words[m_tmp_words_i++];
|
66
65
|
}
|
67
66
|
|
68
|
-
/* related to max_word_length and
|
67
|
+
/* related to max_word_length and match_cache_size */
|
69
68
|
static const int max_tmp_words = 64;
|
70
69
|
Word m_tmp_words[max_tmp_words];
|
71
70
|
int m_tmp_words_i;
|
data/ext/rmmseg/chunk.h
CHANGED
@@ -12,36 +12,37 @@ namespace rmmseg
|
|
12
12
|
*/
|
13
13
|
struct Chunk
|
14
14
|
{
|
15
|
-
int total_length()
|
15
|
+
int total_length() const
|
16
16
|
{
|
17
17
|
int len = 0;
|
18
18
|
for (int i = 0; i < n; ++i)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
len += std::abs(words[i]->length);
|
20
|
+
//if (words[i]->length == -1) /* tmp word */
|
21
|
+
// len += 1;
|
22
|
+
//else
|
23
|
+
// len += words[i]->length;
|
23
24
|
return len;
|
24
25
|
}
|
25
|
-
double average_length()
|
26
|
+
double average_length() const
|
26
27
|
{
|
27
28
|
return ((double)total_length())/n;
|
28
29
|
}
|
29
|
-
double variance()
|
30
|
+
double variance() const
|
30
31
|
{
|
31
32
|
double avg = average_length();
|
32
33
|
double sqr_sum = 0;
|
33
34
|
double tmp;
|
34
35
|
for (int i = 0; i < n; ++i)
|
35
36
|
{
|
36
|
-
tmp = words[i]->length;
|
37
|
-
if (tmp == -1)
|
38
|
-
|
37
|
+
tmp = std::abs(words[i]->length);
|
38
|
+
//if (tmp == -1)
|
39
|
+
// tmp = 1;
|
39
40
|
tmp = tmp-avg;
|
40
41
|
sqr_sum += tmp*tmp;
|
41
42
|
}
|
42
43
|
return std::sqrt(sqr_sum);
|
43
44
|
}
|
44
|
-
int degree_of_morphemic_freedom()
|
45
|
+
int degree_of_morphemic_freedom() const
|
45
46
|
{
|
46
47
|
int sum = 0;
|
47
48
|
for (int i = 0; i < n; ++i)
|
data/ext/rmmseg/dict.cpp
CHANGED
@@ -12,12 +12,12 @@ namespace rmmseg
|
|
12
12
|
Entry *next;
|
13
13
|
};
|
14
14
|
|
15
|
-
const
|
16
|
-
const
|
15
|
+
const size_t init_size = 262147;
|
16
|
+
const size_t max_density = 5;
|
17
17
|
/*
|
18
18
|
Table of prime numbers 2^n+a, 2<=n<=30.
|
19
19
|
*/
|
20
|
-
static
|
20
|
+
static size_t primes[] = {
|
21
21
|
524288 + 21,
|
22
22
|
1048576 + 7,
|
23
23
|
2097152 + 17,
|
@@ -33,14 +33,14 @@ namespace rmmseg
|
|
33
33
|
};
|
34
34
|
|
35
35
|
|
36
|
-
static
|
37
|
-
static
|
36
|
+
static size_t n_bins = init_size;
|
37
|
+
static size_t n_entries = 0;
|
38
38
|
static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
|
39
39
|
sizeof(Entry *)));
|
40
40
|
|
41
|
-
static
|
41
|
+
static size_t new_size()
|
42
42
|
{
|
43
|
-
for (
|
43
|
+
for (size_t i = 0;
|
44
44
|
i < sizeof(primes)/sizeof(primes[0]);
|
45
45
|
++i)
|
46
46
|
{
|
@@ -76,7 +76,7 @@ namespace rmmseg
|
|
76
76
|
Entry *entry, *next;
|
77
77
|
unsigned int hash_val;
|
78
78
|
|
79
|
-
for (
|
79
|
+
for (size_t i = 0; i < n_bins; ++i)
|
80
80
|
{
|
81
81
|
entry = bins[i];
|
82
82
|
while (entry)
|
@@ -140,6 +140,7 @@ namespace rmmseg
|
|
140
140
|
entry->next = NULL;
|
141
141
|
bins[h] = entry;
|
142
142
|
n_entries++;
|
143
|
+
return;
|
143
144
|
}
|
144
145
|
|
145
146
|
bool done = false;
|
@@ -168,6 +169,7 @@ namespace rmmseg
|
|
168
169
|
entry->word = word;
|
169
170
|
entry->next = bins[h];
|
170
171
|
bins[h] = entry;
|
172
|
+
n_entries++;
|
171
173
|
}
|
172
174
|
}
|
173
175
|
|
@@ -179,7 +181,7 @@ namespace rmmseg
|
|
179
181
|
return false;
|
180
182
|
}
|
181
183
|
|
182
|
-
const
|
184
|
+
const size_t buf_len = 24;
|
183
185
|
char buf[buf_len];
|
184
186
|
char *ptr;
|
185
187
|
|
data/ext/rmmseg/memory.cpp
CHANGED
data/ext/rmmseg/memory.h
CHANGED
@@ -12,12 +12,12 @@
|
|
12
12
|
|
13
13
|
namespace rmmseg
|
14
14
|
{
|
15
|
-
const
|
15
|
+
const size_t REALLOC_SIZE = 2048; /* 2KB */
|
16
16
|
|
17
|
-
extern
|
18
|
-
extern char
|
17
|
+
extern size_t _pool_size;
|
18
|
+
extern char *_pool_base;
|
19
19
|
|
20
|
-
inline void *pool_alloc(
|
20
|
+
inline void *pool_alloc(size_t len)
|
21
21
|
{
|
22
22
|
void *mem = _pool_base;
|
23
23
|
|
data/ext/rmmseg/rmmseg.cpp
CHANGED
@@ -36,7 +36,7 @@ extern "C" {
|
|
36
36
|
*/
|
37
37
|
static VALUE dic_load_chars(VALUE mod, VALUE path)
|
38
38
|
{
|
39
|
-
if (rmmseg::dict::load_chars(
|
39
|
+
if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
|
40
40
|
return Qtrue;
|
41
41
|
return Qfalse;
|
42
42
|
}
|
@@ -51,7 +51,7 @@ extern "C" {
|
|
51
51
|
*/
|
52
52
|
static VALUE dic_load_words(VALUE mod, VALUE path)
|
53
53
|
{
|
54
|
-
if (rmmseg::dict::load_words(
|
54
|
+
if (rmmseg::dict::load_words(RSTRING_PTR(path)))
|
55
55
|
return Qtrue;
|
56
56
|
return Qfalse;
|
57
57
|
}
|
@@ -70,8 +70,8 @@ extern "C" {
|
|
70
70
|
*/
|
71
71
|
static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
|
72
72
|
{
|
73
|
-
const char *str =
|
74
|
-
int nbytes =
|
73
|
+
const char *str = RSTRING_PTR(word);
|
74
|
+
int nbytes = RSTRING_LEN(word);
|
75
75
|
rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
|
76
76
|
rmmseg::dict::add(w);
|
77
77
|
return Qnil;
|
@@ -88,8 +88,8 @@ extern "C" {
|
|
88
88
|
*/
|
89
89
|
static VALUE dic_has_word(VALUE mod, VALUE word)
|
90
90
|
{
|
91
|
-
const char *str =
|
92
|
-
int nbytes =
|
91
|
+
const char *str = RSTRING_PTR(word);
|
92
|
+
int nbytes = RSTRING_LEN(word);
|
93
93
|
if (rmmseg::dict::get(str, nbytes) != NULL)
|
94
94
|
return Qtrue;
|
95
95
|
return Qfalse;
|
@@ -162,16 +162,17 @@ extern "C" {
|
|
162
162
|
int start = t.text-base;
|
163
163
|
|
164
164
|
// This is necessary, see
|
165
|
-
// http://pluskid.
|
165
|
+
// http://lifegoo.pluskid.org/?p=348
|
166
166
|
volatile VALUE text = rb_str_new(t.text, t.length);
|
167
167
|
tk->text = text;
|
168
168
|
|
169
169
|
tk->start = INT2FIX(start);
|
170
170
|
tk->end = INT2FIX(start + t.length);
|
171
|
-
|
171
|
+
volatile VALUE tok = Data_Wrap_Struct(cToken,
|
172
172
|
(RUBY_DATA_FUNC)tk_mark,
|
173
173
|
(RUBY_DATA_FUNC)tk_free,
|
174
174
|
tk);
|
175
|
+
return tok;
|
175
176
|
}
|
176
177
|
|
177
178
|
/*********************
|
@@ -207,8 +208,8 @@ extern "C" {
|
|
207
208
|
void *mem;
|
208
209
|
algor->text = text;
|
209
210
|
mem = malloc(sizeof(rmmseg::Algorithm));
|
210
|
-
algor->algor = new(mem) rmmseg::Algorithm(
|
211
|
-
|
211
|
+
algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
|
212
|
+
RSTRING_LEN(text));
|
212
213
|
|
213
214
|
return Data_Wrap_Struct(klass,
|
214
215
|
(RUBY_DATA_FUNC)algor_mark,
|
@@ -231,7 +232,8 @@ extern "C" {
|
|
231
232
|
|
232
233
|
if (tk.length == 0)
|
233
234
|
return Qnil;
|
234
|
-
|
235
|
+
volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
|
236
|
+
return rtk;
|
235
237
|
}
|
236
238
|
|
237
239
|
|
data/ext/rmmseg/rules.h
CHANGED
@@ -9,37 +9,36 @@
|
|
9
9
|
namespace rmmseg
|
10
10
|
{
|
11
11
|
template <typename Cmp>
|
12
|
-
|
12
|
+
void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
|
13
13
|
{
|
14
|
-
int i = 1, j;
|
15
|
-
|
16
|
-
|
17
|
-
for (j = 1; j < n; ++j)
|
14
|
+
unsigned int i = 1, j;
|
15
|
+
|
16
|
+
for (j = 1; j < chunks.size(); ++j)
|
18
17
|
{
|
19
|
-
int rlt = cmp(chunks[j],
|
18
|
+
int rlt = cmp(chunks[j], chunks[0]);
|
20
19
|
if (rlt > 0)
|
21
20
|
i = 0;
|
22
21
|
if (rlt >= 0)
|
23
22
|
std::swap(chunks[i++], chunks[j]);
|
24
23
|
}
|
25
|
-
|
24
|
+
chunks.erase(chunks.begin()+i, chunks.end());
|
26
25
|
}
|
27
26
|
|
28
27
|
struct MMCmp_t
|
29
28
|
{
|
30
|
-
int operator()(Chunk &a, Chunk &b)
|
29
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
31
30
|
{
|
32
31
|
return a.total_length() - b.total_length();
|
33
32
|
}
|
34
33
|
} MMCmp;
|
35
|
-
|
34
|
+
void mm_filter(std::vector<Chunk> &chunks)
|
36
35
|
{
|
37
|
-
|
36
|
+
take_highest(chunks, MMCmp);
|
38
37
|
}
|
39
38
|
|
40
39
|
struct LAWLCmp_t
|
41
40
|
{
|
42
|
-
int operator()(Chunk &a, Chunk &b)
|
41
|
+
int operator()(const Chunk &a, const Chunk &b) const
|
43
42
|
{
|
44
43
|
double rlt = a.average_length() - b.average_length();
|
45
44
|
if (rlt == 0)
|
@@ -49,14 +48,14 @@ namespace rmmseg
|
|
49
48
|
return -1;
|
50
49
|
}
|
51
50
|
} LAWLCmp;
|
52
|
-
|
51
|
+
void lawl_filter(std::vector<Chunk> &chunks)
|
53
52
|
{
|
54
|
-
|
53
|
+
take_highest(chunks, LAWLCmp);
|
55
54
|
}
|
56
55
|
|
57
56
|
struct SVWLCmp_t
|
58
57
|
{
|
59
|
-
int operator()(Chunk &a, Chunk& b)
|
58
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
60
59
|
{
|
61
60
|
double rlt = a.variance() - b.variance();
|
62
61
|
if (rlt == 0)
|
@@ -66,21 +65,21 @@ namespace rmmseg
|
|
66
65
|
return -1;
|
67
66
|
}
|
68
67
|
} SVWLCmp;
|
69
|
-
|
68
|
+
void svwl_filter(std::vector<Chunk> &chunks)
|
70
69
|
{
|
71
|
-
|
70
|
+
take_highest(chunks, SVWLCmp);
|
72
71
|
}
|
73
72
|
|
74
73
|
struct LSDMFOCWCmp_t
|
75
74
|
{
|
76
|
-
int operator()(Chunk &a, Chunk& b)
|
75
|
+
int operator()(const Chunk &a, const Chunk& b) const
|
77
76
|
{
|
78
77
|
return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
|
79
78
|
}
|
80
79
|
} LSDMFOCWCmp;
|
81
|
-
|
80
|
+
void lsdmfocw_filter(std::vector<Chunk> &chunks)
|
82
81
|
{
|
83
|
-
|
82
|
+
take_highest(chunks, LSDMFOCWCmp);
|
84
83
|
}
|
85
84
|
}
|
86
85
|
|
data/ext/rmmseg/word.h
CHANGED
@@ -28,11 +28,11 @@ namespace rmmseg
|
|
28
28
|
if (freq > USHRT_MAX)
|
29
29
|
freq = USHRT_MAX; /* avoid overflow */
|
30
30
|
if (nbytes == -1)
|
31
|
-
nbytes = strlen(text);
|
31
|
+
nbytes = std::strlen(text);
|
32
32
|
Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
|
33
33
|
+ nbytes+1
|
34
34
|
- word_embed_len));
|
35
|
-
w->nbytes =
|
35
|
+
w->nbytes = nbytes;
|
36
36
|
w->length = length;
|
37
37
|
w->freq = freq;
|
38
38
|
std::strncpy(w->text, text, nbytes);
|
data/misc/homepage.erb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
<%# -*- mode: text; coding: utf-8 -*- %>
|
2
2
|
<%
|
3
3
|
$title = "rmmseg-cpp Homepage"
|
4
|
-
$authors = { 'pluskid' => 'http://pluskid.
|
4
|
+
$authors = { 'pluskid' => 'http://blog.pluskid.org' }
|
5
5
|
%>
|
6
6
|
|
7
7
|
<% chapter "Introduction" do %>
|
@@ -110,7 +110,24 @@
|
|
110
110
|
RMMSeg::Dictionary.load_dictionaries
|
111
111
|
</code>
|
112
112
|
|
113
|
-
Now rmmseg-cpp will be ready to do segmenting.
|
113
|
+
Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
|
114
|
+
dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
|
115
|
+
<tt>load_dictionaries</tt>. e.g.
|
116
|
+
|
117
|
+
<code>
|
118
|
+
RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
|
119
|
+
[:words, "my_words.dic"],
|
120
|
+
[:words, "my_words2.dic"]]
|
121
|
+
</code>
|
122
|
+
|
123
|
+
The basic formats for char-dictionary and word-dictionary are similar. For each line,
|
124
|
+
there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
|
125
|
+
at the end of the dictionary file. And the number in char-dictionary and word-dictionary
|
126
|
+
has different meanings.
|
127
|
+
|
128
|
+
In char-dictionary, the number means the frequency of the character. In word-dictionary,
|
129
|
+
the number means the number of characters in the word. Note that this is NOT the number
|
130
|
+
of *bytes* in the word.
|
114
131
|
|
115
132
|
<% end %>
|
116
133
|
|
@@ -139,7 +156,7 @@
|
|
139
156
|
of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
|
140
157
|
|
141
158
|
<% figure "Ferret Example Screenshot" do %>
|
142
|
-
!http://pluskid.
|
159
|
+
!http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
|
143
160
|
<% end %>
|
144
161
|
|
145
162
|
<% end %>
|
@@ -174,6 +191,6 @@
|
|
174
191
|
<% chapter "Resources" do %>
|
175
192
|
* "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
|
176
193
|
* "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
|
177
|
-
* "Free Mind":http://pluskid.
|
194
|
+
* "Free Mind":http://blog.pluskid.org/: The author's blog.
|
178
195
|
* "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
|
179
196
|
<% end %>
|
data/misc/homepage.html
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
|
6
|
-
<meta name="date" content="
|
6
|
+
<meta name="date" content="10 September 2011"/>
|
7
7
|
<meta name="author" content="pluskid"/>
|
8
8
|
<meta name="generator" content="Gerbil 3.1.0"/>
|
9
9
|
<title>rmmseg-cpp Homepage</title>
|
@@ -928,8 +928,8 @@
|
|
928
928
|
|
929
929
|
|
930
930
|
<h1 class="title"><a class="here" href="#">rmmseg-cpp Homepage</a></h1>
|
931
|
-
<h2 class="authors"><a href="http://pluskid.
|
932
|
-
<h3 class="date">
|
931
|
+
<h2 class="authors"><a href="http://blog.pluskid.org">pluskid</a></h2>
|
932
|
+
<h3 class="date">10 September 2011</h3>
|
933
933
|
|
934
934
|
</div>
|
935
935
|
|
@@ -943,12 +943,12 @@
|
|
943
943
|
<div id="Contents">
|
944
944
|
<h1 class="title"><a class="here" href="#Contents">Contents</a></h1>
|
945
945
|
<ul>
|
946
|
-
<li>1 <a id="
|
946
|
+
<li>1 <a id="a16539600" href="#Introduction">Introduction</a></li><li>2 <a id="a16533660" href="#Setup">Setup</a><ul><li>2.1 <a id="a16530900" href="#Requirements">Requirements</a></li><li>2.2 <a id="a16472140" href="#Installation">Installation</a><ul><li>2.2.1 <a id="a16468300" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2 <a id="a16363260" href="#From-Git">From Git</a></li></ul></li></ul></li><li>3 <a id="a16272720" href="#Usage">Usage</a><ul><li>3.1 <a id="a16246860" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2 <a id="a16240340" href="#Use-in-Ruby-program">Use in Ruby program</a><ul><li>3.2.1 <a id="a16231580" href="#Initialize">Initialize</a></li><li>3.2.2 <a id="a16187880" href="#Ferret-Integration">Ferret Integration</a></li><li>3.2.3 <a id="a16113620" href="#Normal-Ruby-program">Normal Ruby program</a></li></ul></li></ul></li><li>4 <a id="a16072000" href="#Who-use-it">Who use it</a></li><li>5 <a id="a16034860" href="#Resources">Resources</a></li>
|
947
947
|
|
948
948
|
</ul>
|
949
949
|
</div>
|
950
950
|
|
951
|
-
<div id="lof"><h1 id
|
951
|
+
<div id="lof"><h1 id="Figures" class="title"><a class="here" href="#Figures">Figures</a></h1> <ol><li><a id="a16148860" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1 id="Tips" class="title"><a class="here" href="#Tips">Tips</a></h1> <ol><li><a id="a16067160" href="#Expand-this-list">Expand this list</a></li></ol><h1 id="Warnings" class="title"><a class="here" href="#Warnings">Warnings</a></h1> <ol><li><a id="a16360020" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></li></ol></div>
|
952
952
|
|
953
953
|
<br style="display: none"/>
|
954
954
|
<hr style="display: none"/>
|
@@ -958,30 +958,30 @@
|
|
958
958
|
<div class="chapter">
|
959
959
|
<h1 class="title">
|
960
960
|
Chapter
|
961
|
-
<a class="list" id="Introduction" href="#
|
961
|
+
<a class="list" id="Introduction" href="#a16539600">1</a>
|
962
962
|
|
963
963
|
<br/>
|
964
964
|
|
965
965
|
<a class="here" href="#Introduction"><big>Introduction</big></a>
|
966
966
|
</h1>
|
967
967
|
|
968
|
-
<div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for
|
969
|
-
Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration
|
968
|
+
<div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for
|
969
|
+
Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration
|
970
970
|
as well as support for normal Ruby program usage.</p>
|
971
|
-
<p>rmmseg-cpp is a re-written of the original
|
972
|
-
<a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written
|
973
|
-
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
971
|
+
<p>rmmseg-cpp is a rewrite of the original
|
972
|
+
<a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written
|
973
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
974
974
|
lots of memory and the segmenting process is rather slow.</p>
|
975
|
-
<p>The interface is almost identical to RMMSeg but the performance is
|
976
|
-
much better. This gem is always preferable in production
|
977
|
-
use. However, if you want to understand how the MMSEG segmenting
|
978
|
-
algorithm works, the source code of RMMSeg is a better choice than
|
975
|
+
<p>The interface is almost identical to RMMSeg but the performance is
|
976
|
+
much better. This gem is always preferable in production
|
977
|
+
use. However, if you want to understand how the MMSEG segmenting
|
978
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
979
979
|
this.</p></div>
|
980
980
|
</div>
|
981
981
|
<div class="chapter">
|
982
982
|
<h1 class="title">
|
983
983
|
Chapter
|
984
|
-
<a class="list" id="Setup" href="#
|
984
|
+
<a class="list" id="Setup" href="#a16533660">2</a>
|
985
985
|
|
986
986
|
<br/>
|
987
987
|
|
@@ -990,7 +990,7 @@ this.</p></div>
|
|
990
990
|
|
991
991
|
<div class="content"><div class="section">
|
992
992
|
<h2 class="title">
|
993
|
-
<a class="list" id="Requirements" href="#
|
993
|
+
<a class="list" id="Requirements" href="#a16530900">2.1</a> <a class="here" href="#Requirements">Requirements</a>
|
994
994
|
</h2>
|
995
995
|
<div class="content"><p>Your system needs the following software to run RMMSeg.</p>
|
996
996
|
<table border="1">
|
@@ -1011,37 +1011,37 @@ this.</p></div>
|
|
1011
1011
|
<td> Used to build the native extension </td>
|
1012
1012
|
</tr>
|
1013
1013
|
</table></div>
|
1014
|
-
</div
|
1014
|
+
</div>
|
1015
1015
|
<div class="section">
|
1016
1016
|
<h2 class="title">
|
1017
|
-
<a class="list" id="Installation" href="#
|
1017
|
+
<a class="list" id="Installation" href="#a16472140">2.2</a> <a class="here" href="#Installation">Installation</a>
|
1018
1018
|
</h2>
|
1019
1019
|
<div class="content"><div class="section">
|
1020
1020
|
<h3 class="title">
|
1021
|
-
<a class="list" id="Using-RubyGems" href="#
|
1021
|
+
<a class="list" id="Using-RubyGems" href="#a16468300">2.2.1</a> <a class="here" href="#Using-RubyGems">Using RubyGems</a>
|
1022
1022
|
</h3>
|
1023
1023
|
<div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a>:</p>
|
1024
1024
|
sudo gem install rmmseg-cpp
|
1025
|
-
<p>Or you can download the gem file manually from
|
1026
|
-
<a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and
|
1025
|
+
<p>Or you can download the gem file manually from
|
1026
|
+
<a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and
|
1027
1027
|
install it locally:</p>
|
1028
1028
|
sudo gem install --local rmmseg-cpp-x.y.z.gem</div>
|
1029
|
-
</div
|
1029
|
+
</div>
|
1030
1030
|
<div class="section">
|
1031
1031
|
<h3 class="title">
|
1032
|
-
<a class="list" id="From-Git" href="#
|
1032
|
+
<a class="list" id="From-Git" href="#a16363260">2.2.2</a> <a class="here" href="#From-Git">From Git</a>
|
1033
1033
|
</h3>
|
1034
|
-
<div class="content"><p>To build the gem manually from the latest source code. You’ll
|
1034
|
+
<div class="content"><p>To build the gem manually from the latest source code. You’ll
|
1035
1035
|
need to have <strong>git</strong> and <strong>rake</strong> installed.</p>
|
1036
1036
|
<p><div class="warning">
|
1037
|
-
<p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#
|
1037
|
+
<p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#a16360020">Warning 1</a>. <a class="here" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></p>
|
1038
1038
|
|
1039
|
-
<div class="content icon-warning">While I tried to avoid such kind of problems, the source
|
1040
|
-
code from the repository might still be broken sometimes
|
1039
|
+
<div class="content icon-warning">While I tried to avoid such kind of problems, the source
|
1040
|
+
code from the repository might still be broken sometimes.
|
1041
1041
|
It is generally not recommended to follow the source code.</div>
|
1042
|
-
</div>
|
1043
|
-
The source code of rmmseg-cpp is hosted at
|
1044
|
-
<a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the
|
1042
|
+
</div>
|
1043
|
+
The source code of rmmseg-cpp is hosted at
|
1044
|
+
<a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the
|
1045
1045
|
source code by git clone:</p>
|
1046
1046
|
git clone git://github.com/pluskid/rmmseg-cpp.git
|
1047
1047
|
<p>then you can use Rake to build and install the gem:</p>
|
@@ -1053,97 +1053,107 @@ rake gem:install</div>
|
|
1053
1053
|
<div class="chapter">
|
1054
1054
|
<h1 class="title">
|
1055
1055
|
Chapter
|
1056
|
-
<a class="list" id="Usage" href="#
|
1056
|
+
<a class="list" id="Usage" href="#a16272720">3</a>
|
1057
1057
|
|
1058
1058
|
<br/>
|
1059
1059
|
|
1060
1060
|
<a class="here" href="#Usage"><big>Usage</big></a>
|
1061
1061
|
</h1>
|
1062
1062
|
|
1063
|
-
<div class="content"
|
1063
|
+
<div class="content">
|
1064
|
+
<p><div class="section">
|
1064
1065
|
<h2 class="title">
|
1065
|
-
<a class="list" id="Stand-Alone-rmmseg" href="#
|
1066
|
+
<a class="list" id="Stand-Alone-rmmseg" href="#a16246860">3.1</a> <a class="here" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a>
|
1066
1067
|
</h2>
|
1067
|
-
<div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option
|
1068
|
-
|
1069
|
-
|
1070
|
-
<p>It reads from STDIN and print result to STDOUT. Here is a real<br />
|
1068
|
+
<div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option:</p>
|
1069
|
+
rmmseg -h
|
1070
|
+
<p>It reads from STDIN and prints the result to STDOUT. Here is a real
|
1071
1071
|
example:</p>
|
1072
1072
|
$ echo "我们都喜欢用 Ruby" | rmmseg
|
1073
1073
|
我们 都 喜欢 用 Ruby</div>
|
1074
|
-
</div
|
1074
|
+
</div>
|
1075
1075
|
<div class="section">
|
1076
1076
|
<h2 class="title">
|
1077
|
-
<a class="list" id="Use-in-Ruby-program" href="#
|
1077
|
+
<a class="list" id="Use-in-Ruby-program" href="#a16240340">3.2</a> <a class="here" href="#Use-in-Ruby-program">Use in Ruby program</a>
|
1078
1078
|
</h2>
|
1079
1079
|
<div class="content"><div class="section">
|
1080
1080
|
<h3 class="title">
|
1081
|
-
<a class="list" id="Initialize" href="#
|
1081
|
+
<a class="list" id="Initialize" href="#a16231580">3.2.1</a> <a class="here" href="#Initialize">Initialize</a>
|
1082
1082
|
</h3>
|
1083
1083
|
<div class="content"><p>To use rmmseg-cpp in Ruby program, you’ll first load it with RubyGems:</p>
|
1084
1084
|
<pre class="code">
|
1085
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1086
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1085
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rubygems</span><span style="color:#710">'</span></span>
|
1086
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg</span><span style="color:#710">'</span></span>
|
1087
1087
|
</pre>
|
1088
|
-
<p>Then you may customize the dictionaries used by rmmseg-cpp
|
1089
|
-
(see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on
|
1088
|
+
<p>Then you may customize the dictionaries used by rmmseg-cpp
|
1089
|
+
(see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on
|
1090
1090
|
how to add your own dictionaries) and load all dictionaries:</p>
|
1091
1091
|
<pre class="code">
|
1092
|
-
<span style="color:#036;
|
1092
|
+
<span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.load_dictionaries
|
1093
|
+
</pre>
|
1094
|
+
<p>Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
|
1095
|
+
dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
|
1096
|
+
<tt>load_dictionaries</tt>. e.g.</p>
|
1097
|
+
<pre class="code">
|
1098
|
+
<span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.dictionaries = [[<span style="color:#A60">:chars</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_chars.dic</span><span style="color:#710">"</span></span>],
|
1099
|
+
[<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_words.dic</span><span style="color:#710">"</span></span>],
|
1100
|
+
[<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="">my_words2.dic</span><span style="color:#710">"</span></span>]]
|
1093
1101
|
</pre>
|
1094
|
-
<p>
|
1095
|
-
</
|
1102
|
+
<p>The basic formats for char-dictionary and word-dictionary are similar. For each line,
|
1103
|
+
there is a number, then <strong>a</strong> space, then the string. Note there <strong>SHOULD</strong> be a newline
|
1104
|
+
at the end of the dictionary file. And the number in char-dictionary and word-dictionary
|
1105
|
+
has different meanings.</p>
|
1106
|
+
<p>In char-dictionary, the number means the frequency of the character. In word-dictionary,
|
1107
|
+
the number means the number of characters in the word. Note that this is NOT the number
|
1108
|
+
of <strong>bytes</strong> in the word.</p></div>
|
1109
|
+
</div>
|
1096
1110
|
<div class="section">
|
1097
1111
|
<h3 class="title">
|
1098
|
-
<a class="list" id="Ferret-Integration" href="#
|
1112
|
+
<a class="list" id="Ferret-Integration" href="#a16187880">3.2.2</a> <a class="here" href="#Ferret-Integration">Ferret Integration</a>
|
1099
1113
|
</h3>
|
1100
|
-
<div class="content"><p>To use rmmseg-cpp with Ferret, you’ll need to <code class="code">require</code> the
|
1101
|
-
Ferret support of rmmseg-cpp (Of course you’ll also have to
|
1102
|
-
got Ferret installed. If you have problems running the belowing
|
1103
|
-
example, please try to update to the latest version of both
|
1104
|
-
Ferret and rmmseg-cpp first)
|
1105
|
-
<br />
|
1114
|
+
<div class="content"><p>To use rmmseg-cpp with Ferret, you’ll need to <code class="code">require</code> the
|
1115
|
+
Ferret support of rmmseg-cpp (Of course you’ll also have to
|
1116
|
+
get Ferret installed. If you have problems running the following
|
1117
|
+
example, please try to update to the latest version of both
|
1118
|
+
Ferret and rmmseg-cpp first):</p>
|
1106
1119
|
<pre class="code">
|
1107
|
-
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="
|
1108
|
-
</pre
|
1109
|
-
<p>rmmseg-cpp comes with a ready to use Ferret analyzer
|
1110
|
-
<br />
|
1120
|
+
require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg/ferret</span><span style="color:#710">'</span></span>
|
1121
|
+
</pre>
|
1122
|
+
<p>rmmseg-cpp comes with a ready to use Ferret analyzer:</p>
|
1111
1123
|
<pre class="code">
|
1112
|
-
analyzer = <span style="color:#036;
|
1113
|
-
<span style="color:#036;
|
1124
|
+
analyzer = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analyzer</span>.new { |tokenizer|
|
1125
|
+
<span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analysis</span>::<span style="color:#036;font-weight:bold">LowerCaseFilter</span>.new(tokenizer)
|
1114
1126
|
}
|
1115
|
-
index = <span style="color:#036;
|
1116
|
-
</pre
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
<
|
1121
|
-
<div class="
|
1122
|
-
<p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a-607147048">Figure 1</a>. <a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
|
1123
|
-
<div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1127
|
+
index = <span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Index</span>::<span style="color:#036;font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> => analyzer)
|
1128
|
+
</pre>
|
1129
|
+
<p>A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
|
1130
|
+
of running that example is shown in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1. Ferret Example Screenshot</a>.</p>
|
1131
|
+
<p><div class="figure">
|
1132
|
+
<p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a16148860">Figure 1</a>. <a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
|
1133
|
+
<div class="content"><img src="http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1124
1134
|
</div></p></div>
|
1125
|
-
</div
|
1135
|
+
</div>
|
1126
1136
|
<div class="section">
|
1127
1137
|
<h3 class="title">
|
1128
|
-
<a class="list" id="Normal-Ruby-program" href="#
|
1138
|
+
<a class="list" id="Normal-Ruby-program" href="#a16113620">3.2.3</a> <a class="here" href="#Normal-Ruby-program">Normal Ruby program</a>
|
1129
1139
|
</h3>
|
1130
|
-
<div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create
|
1131
|
-
an <code class="code"><span style="color:#036;
|
1140
|
+
<div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create
|
1141
|
+
an <code class="code"><span style="color:#036;font-weight:bold">Algorithm</span></code> object and call <code class="code">next_token</code> until a <code class="code"><span style="color:#038;font-weight:bold">nil</span></code> is returned:</p>
|
1132
1142
|
<pre class="code">
|
1133
|
-
algor = <span style="color:#036;
|
1134
|
-
loop <span style="color:#080;
|
1143
|
+
algor = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Algorithm</span>.new(text)
|
1144
|
+
loop <span style="color:#080;font-weight:bold">do</span>
|
1135
1145
|
tok = algor.next_token
|
1136
|
-
<span style="color:#080;
|
1137
|
-
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="background:
|
1138
|
-
<span style="color:#080;
|
1146
|
+
<span style="color:#080;font-weight:bold">break</span> <span style="color:#080;font-weight:bold">if</span> tok.nil?
|
1147
|
+
puts <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">"</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.text<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style=""> [</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.start<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">..</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.end<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">]</span><span style="color:#710">"</span></span>
|
1148
|
+
<span style="color:#080;font-weight:bold">end</span>
|
1139
1149
|
</pre></div>
|
1140
1150
|
</div></div>
|
1141
|
-
</div></div>
|
1151
|
+
</div></p></div>
|
1142
1152
|
</div>
|
1143
1153
|
<div class="chapter">
|
1144
1154
|
<h1 class="title">
|
1145
1155
|
Chapter
|
1146
|
-
<a class="list" id="Who-use-it" href="#
|
1156
|
+
<a class="list" id="Who-use-it" href="#a16072000">4</a>
|
1147
1157
|
|
1148
1158
|
<br/>
|
1149
1159
|
|
@@ -1151,20 +1161,20 @@ loop <span style="color:#080; font-weight:bold">do</span>
|
|
1151
1161
|
</h1>
|
1152
1162
|
|
1153
1163
|
<div class="content"><p><div class="tip">
|
1154
|
-
<p class="title"><a class="list" id="Expand-this-list" href="#
|
1164
|
+
<p class="title"><a class="list" id="Expand-this-list" href="#a16067160">Tip 1</a>. <a class="here" href="#Expand-this-list">Expand this list</a></p>
|
1155
1165
|
|
1156
|
-
<div class="content icon-tip">If you used rmmseg-cpp and would like your project to
|
1166
|
+
<div class="content icon-tip">If you used rmmseg-cpp and would like your project to
|
1157
1167
|
appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</div>
|
1158
1168
|
</div></p>
|
1159
1169
|
<ul>
|
1160
|
-
<li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developper
|
1170
|
+
<li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developer
|
1161
1171
|
communities in China.</li>
|
1162
1172
|
</ul></div>
|
1163
1173
|
</div>
|
1164
1174
|
<div class="chapter">
|
1165
1175
|
<h1 class="title">
|
1166
1176
|
Chapter
|
1167
|
-
<a class="list" id="Resources" href="#
|
1177
|
+
<a class="list" id="Resources" href="#a16034860">5</a>
|
1168
1178
|
|
1169
1179
|
<br/>
|
1170
1180
|
|
@@ -1174,7 +1184,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
|
|
1174
1184
|
<div class="content"><ul>
|
1175
1185
|
<li><a href="http://rubyforge.org/projects/rmmseg-cpp/">Project Home</a>: The Project page at RubyForge.</li>
|
1176
1186
|
<li><a href="http://rmmseg-cpp.rubyforge.org/rdoc/index.html">RDoc of rmmseg-cpp</a>: The auto generated rdoc of RMMSeg.</li>
|
1177
|
-
<li><a href="http://pluskid.
|
1187
|
+
<li><a href="http://blog.pluskid.org/">Free Mind</a>: The author’s blog.</li>
|
1178
1188
|
<li><a href="mailto:pluskid@gmail.com">Author’s Email</a>: Contact me if you have any problem.</li>
|
1179
1189
|
</ul></div>
|
1180
1190
|
</div></div>
|
@@ -1187,7 +1197,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
|
|
1187
1197
|
|
1188
1198
|
<div id="footer">
|
1189
1199
|
|
1190
|
-
Generated on
|
1200
|
+
Generated on 2011-09-10 15:59:08 +0800 by <a href="http://gerbil.rubyforge.org">Gerbil</a> 3.1.0.
|
1191
1201
|
|
1192
1202
|
<div id="footer-credits">
|
1193
1203
|
<span class="icon-warning" style="float: right"> </span>
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg-cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 2
|
8
|
+
- 9
|
9
|
+
version: 0.2.9
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- pluskid
|
@@ -9,11 +14,25 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date:
|
17
|
+
date: 2011-09-10 00:00:00 +08:00
|
13
18
|
default_executable:
|
14
19
|
dependencies: []
|
15
20
|
|
16
|
-
description:
|
21
|
+
description: |-
|
22
|
+
rmmseg-cpp is a high performance Chinese word segmentation utility for
|
23
|
+
Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
|
24
|
+
as well as support for normal Ruby program usage.
|
25
|
+
|
26
|
+
rmmseg-cpp is a rewrite of the original
|
27
|
+
RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written
|
28
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
29
|
+
lots of memory and the segmenting process is rather slow.
|
30
|
+
|
31
|
+
The interface is almost identical to RMMSeg but the performance is
|
32
|
+
much better. This gem is always preferable in production
|
33
|
+
use. However, if you want to understand how the MMSEG segmenting
|
34
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
35
|
+
this.
|
17
36
|
email: pluskid@gmail.com
|
18
37
|
executables:
|
19
38
|
- rmmseg
|
@@ -69,6 +88,8 @@ files:
|
|
69
88
|
- test/test_rmmseg.rb
|
70
89
|
has_rdoc: true
|
71
90
|
homepage: http://rmmseg-cpp.rubyforge.org
|
91
|
+
licenses: []
|
92
|
+
|
72
93
|
post_install_message:
|
73
94
|
rdoc_options:
|
74
95
|
- --main
|
@@ -77,23 +98,27 @@ require_paths:
|
|
77
98
|
- lib
|
78
99
|
- ext
|
79
100
|
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
80
102
|
requirements:
|
81
103
|
- - ">="
|
82
104
|
- !ruby/object:Gem::Version
|
105
|
+
segments:
|
106
|
+
- 0
|
83
107
|
version: "0"
|
84
|
-
version:
|
85
108
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
86
110
|
requirements:
|
87
111
|
- - ">="
|
88
112
|
- !ruby/object:Gem::Version
|
113
|
+
segments:
|
114
|
+
- 0
|
89
115
|
version: "0"
|
90
|
-
version:
|
91
116
|
requirements: []
|
92
117
|
|
93
118
|
rubyforge_project: rmmseg-cpp
|
94
|
-
rubygems_version: 1.
|
119
|
+
rubygems_version: 1.3.7
|
95
120
|
signing_key:
|
96
|
-
specification_version:
|
121
|
+
specification_version: 3
|
97
122
|
summary: rmmseg-cpp is a high performance Chinese word segmentation utility for Ruby
|
98
123
|
test_files:
|
99
124
|
- test/test_rmmseg.rb
|