rmmseg-cpp 0.2.7 → 0.2.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,11 @@
1
+ == 0.2.9 / 2011-09-10
2
+
3
+ * Fix GC-related bugs in Ruby C extension.
4
+
5
+ == 0.2.8 / 2010-03-22
6
+
7
+ * Minor release, fixed building bugs in Ruby 1.9.
8
+
1
9
  == 0.2.7 / 2008-09-17
2
10
 
3
11
  * Fix various stupid bugs (typo) that cause problems under MacOSX.
data/README CHANGED
@@ -43,6 +43,17 @@ init by loading the dictionaries:
43
43
 
44
44
  RMMSeg::Dictionary.load_dictionaries
45
45
 
46
+ If you want to add customized dictionaries, append them to
47
+ +RMMSeg::Dictionary.dictionaries+ before calling +load_dictionaries+.
48
+ The formats of chars.dic and words.dic are NOT the same:
49
+
50
+ * For chars.dic, each line contains freq, a space, and then the character
51
+ * For words.dic, each line contains length, a space, and then the word.
52
+
53
+ Note that length means the length of the word, i.e. the number of characters
54
+ of the word, not the number of bytes. WARNING: there should be a newline at
55
+ the end of every dictionary file.
56
+
46
57
  Then create an +Algorithm+ object and call +next_token+ until it returns
47
58
  +nil+:
48
59
 
data/Rakefile CHANGED
@@ -8,7 +8,7 @@ require 'rmmseg'
8
8
  task :default => 'spec:run'
9
9
 
10
10
  PROJ.name = 'rmmseg-cpp'
11
- PROJ.version = '0.2.7'
11
+ PROJ.version = '0.2.9'
12
12
  PROJ.authors = 'pluskid'
13
13
  PROJ.email = 'pluskid@gmail.com'
14
14
  PROJ.url = 'http://rmmseg-cpp.rubyforge.org'
@@ -71,27 +71,28 @@ namespace rmmseg
71
71
 
72
72
  Token Algorithm::get_cjk_word(int len)
73
73
  {
74
- create_chunks();
75
-
76
- if (m_chunks_size > 1)
77
- m_chunks_size = mm_filter(m_chunks, m_chunks_size);
78
- if (m_chunks_size > 1)
79
- m_chunks_size = lawl_filter(m_chunks, m_chunks_size);
80
- if (m_chunks_size > 1)
81
- m_chunks_size = svwl_filter(m_chunks, m_chunks_size);
82
- if (m_chunks_size > 1)
83
- m_chunks_size = lsdmfocw_filter(m_chunks, m_chunks_size);
84
-
85
- if (m_chunks_size < 1)
74
+ vector<Chunk> chunks = create_chunks();
75
+
76
+ if (chunks.size() > 1)
77
+ mm_filter(chunks);
78
+ if (chunks.size() > 1)
79
+ lawl_filter(chunks);
80
+ if (chunks.size() > 1)
81
+ svwl_filter(chunks);
82
+ if (chunks.size() > 1)
83
+ lsdmfocw_filter(chunks);
84
+
85
+ if (chunks.size() < 1)
86
86
  return Token(NULL, 0);
87
87
 
88
- Token token(m_text+m_pos, m_chunks[0].words[0]->nbytes);
89
- m_pos += m_chunks[0].words[0]->nbytes;
88
+ Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
89
+ m_pos += chunks[0].words[0]->nbytes;
90
90
  return token;
91
91
  }
92
92
 
93
- void Algorithm::create_chunks()
93
+ vector<Chunk> Algorithm::create_chunks()
94
94
  {
95
+ vector<Chunk> chunks;
95
96
  Chunk chunk;
96
97
  Word *w1, *w2, *w3;
97
98
 
@@ -100,8 +101,6 @@ namespace rmmseg
100
101
  typedef vec_t::iterator it_t;
101
102
 
102
103
  vec_t words1 = find_match_words();
103
- m_chunks_size = 0;
104
-
105
104
  for (it_t i1 = words1.begin();
106
105
  i1 != words1.end();
107
106
  ++i1)
@@ -136,17 +135,13 @@ namespace rmmseg
136
135
  chunk.n = 3;
137
136
  chunk.words[2] = w3;
138
137
  }
139
- memcpy(m_chunks+m_chunks_size, &chunk,
140
- sizeof(Chunk));
141
- m_chunks_size++;
138
+ chunks.push_back(chunk);
142
139
  }
143
140
  }
144
141
  else if (m_pos == m_text_length)
145
142
  {
146
143
  chunk.n = 2;
147
- memcpy(m_chunks+m_chunks_size, &chunk,
148
- sizeof(Chunk));
149
- m_chunks_size++;
144
+ chunks.push_back(chunk);
150
145
  }
151
146
  m_pos -= w2->nbytes;
152
147
  }
@@ -154,13 +149,13 @@ namespace rmmseg
154
149
  else if (m_pos == m_text_length)
155
150
  {
156
151
  chunk.n = 1;
157
- memcpy(m_chunks+m_chunks_size, &chunk, sizeof(Chunk));
158
- m_chunks_size++;
152
+ chunks.push_back(chunk);
159
153
  }
160
154
  m_pos -= w1->nbytes;
161
155
  }
162
156
 
163
157
  m_pos = orig_pos;
158
+ return chunks;
164
159
  }
165
160
 
166
161
  int Algorithm::next_char()
@@ -169,15 +164,11 @@ namespace rmmseg
169
164
  unsigned char ch = m_text[m_pos];
170
165
  if (ch >= 0xC0 && ch <= 0xDF)
171
166
  {
172
- if (m_text_length-m_pos < 2)
173
- return 1; /* broken text at the end */
174
- return 2;
167
+ return min(2, m_text_length-m_pos);
175
168
  }
176
169
  if (ch >= 0xE0 && ch <= 0xEF)
177
170
  {
178
- if (m_text_length-m_pos < 3)
179
- return 1; /* broken text at the end */
180
- return 3;
171
+ return min(3, m_text_length-m_pos);
181
172
  }
182
173
  return 1;
183
174
  }
@@ -195,11 +186,11 @@ namespace rmmseg
195
186
 
196
187
  while (m_pos < m_text_length)
197
188
  {
189
+ if (n >= max_word_length())
190
+ break;
198
191
  len = next_char();
199
192
  if (len <= 1)
200
193
  break;
201
- if (n >= max_word_length())
202
- break;
203
194
 
204
195
  m_pos += len;
205
196
  n++;
@@ -22,33 +22,32 @@ namespace rmmseg
22
22
  {
23
23
  public:
24
24
  Algorithm(const char *text, int length)
25
- :m_chunks_size(0), m_text(text), m_pos(0),
25
+ :m_text(text), m_pos(0),
26
26
  m_text_length(length),
27
27
  m_tmp_words_i(0),
28
28
  m_match_cache_i(0)
29
- {
30
- for (int i = 0; i < match_cache_size; ++i)
31
- m_match_cache[i].first = -1;
32
- }
29
+ {
30
+ for (int i = 0; i < match_cache_size; ++i)
31
+ m_match_cache[i].first = -1;
32
+ }
33
33
 
34
34
  Token next_token();
35
35
 
36
+ const char *get_text() const
37
+ {
38
+ return m_text;
39
+ }
40
+
36
41
  private:
37
42
  Token get_basic_latin_word();
38
43
  Token get_cjk_word(int);
39
-
40
- static const int MAX_WORD_LENGTH = 4;
41
- static const int MAX_N_CHUNKS = \
42
- MAX_WORD_LENGTH*MAX_WORD_LENGTH*MAX_WORD_LENGTH;
43
44
 
44
- void create_chunks();
45
+ std::vector<Chunk> create_chunks();
45
46
  int next_word();
46
47
  int next_char();
47
48
  std::vector<Word *> find_match_words();
48
- int max_word_length() { return MAX_WORD_LENGTH; }
49
+ int max_word_length() { return 4; }
49
50
 
50
- Chunk m_chunks[MAX_N_CHUNKS];
51
- int m_chunks_size;
52
51
 
53
52
  const char *m_text;
54
53
  int m_pos;
@@ -65,7 +64,7 @@ namespace rmmseg
65
64
  return &m_tmp_words[m_tmp_words_i++];
66
65
  }
67
66
 
68
- /* related to max_word_length and match_words_cache_size */
67
+ /* related to max_word_length and match_cache_size */
69
68
  static const int max_tmp_words = 64;
70
69
  Word m_tmp_words[max_tmp_words];
71
70
  int m_tmp_words_i;
@@ -12,36 +12,37 @@ namespace rmmseg
12
12
  */
13
13
  struct Chunk
14
14
  {
15
- int total_length()
15
+ int total_length() const
16
16
  {
17
17
  int len = 0;
18
18
  for (int i = 0; i < n; ++i)
19
- if (words[i]->length == -1) /* tmp word */
20
- len += 1;
21
- else
22
- len += words[i]->length;
19
+ len += std::abs(words[i]->length);
20
+ //if (words[i]->length == -1) /* tmp word */
21
+ // len += 1;
22
+ //else
23
+ // len += words[i]->length;
23
24
  return len;
24
25
  }
25
- double average_length()
26
+ double average_length() const
26
27
  {
27
28
  return ((double)total_length())/n;
28
29
  }
29
- double variance()
30
+ double variance() const
30
31
  {
31
32
  double avg = average_length();
32
33
  double sqr_sum = 0;
33
34
  double tmp;
34
35
  for (int i = 0; i < n; ++i)
35
36
  {
36
- tmp = words[i]->length;
37
- if (tmp == -1)
38
- tmp = 1;
37
+ tmp = std::abs(words[i]->length);
38
+ //if (tmp == -1)
39
+ // tmp = 1;
39
40
  tmp = tmp-avg;
40
41
  sqr_sum += tmp*tmp;
41
42
  }
42
43
  return std::sqrt(sqr_sum);
43
44
  }
44
- int degree_of_morphemic_freedom()
45
+ int degree_of_morphemic_freedom() const
45
46
  {
46
47
  int sum = 0;
47
48
  for (int i = 0; i < n; ++i)
@@ -12,12 +12,12 @@ namespace rmmseg
12
12
  Entry *next;
13
13
  };
14
14
 
15
- const int init_size = 262147;
16
- const int max_density = 5;
15
+ const size_t init_size = 262147;
16
+ const size_t max_density = 5;
17
17
  /*
18
18
  Table of prime numbers 2^n+a, 2<=n<=30.
19
19
  */
20
- static int primes[] = {
20
+ static size_t primes[] = {
21
21
  524288 + 21,
22
22
  1048576 + 7,
23
23
  2097152 + 17,
@@ -33,14 +33,14 @@ namespace rmmseg
33
33
  };
34
34
 
35
35
 
36
- static int n_bins = init_size;
37
- static int n_entries = 0;
36
+ static size_t n_bins = init_size;
37
+ static size_t n_entries = 0;
38
38
  static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
39
39
  sizeof(Entry *)));
40
40
 
41
- static int new_size()
41
+ static size_t new_size()
42
42
  {
43
- for (int i = 0;
43
+ for (size_t i = 0;
44
44
  i < sizeof(primes)/sizeof(primes[0]);
45
45
  ++i)
46
46
  {
@@ -76,7 +76,7 @@ namespace rmmseg
76
76
  Entry *entry, *next;
77
77
  unsigned int hash_val;
78
78
 
79
- for (int i = 0; i < n_bins; ++i)
79
+ for (size_t i = 0; i < n_bins; ++i)
80
80
  {
81
81
  entry = bins[i];
82
82
  while (entry)
@@ -140,6 +140,7 @@ namespace rmmseg
140
140
  entry->next = NULL;
141
141
  bins[h] = entry;
142
142
  n_entries++;
143
+ return;
143
144
  }
144
145
 
145
146
  bool done = false;
@@ -168,6 +169,7 @@ namespace rmmseg
168
169
  entry->word = word;
169
170
  entry->next = bins[h];
170
171
  bins[h] = entry;
172
+ n_entries++;
171
173
  }
172
174
  }
173
175
 
@@ -179,7 +181,7 @@ namespace rmmseg
179
181
  return false;
180
182
  }
181
183
 
182
- const int buf_len = 24;
184
+ const size_t buf_len = 24;
183
185
  char buf[buf_len];
184
186
  char *ptr;
185
187
 
@@ -4,6 +4,6 @@
4
4
 
5
5
  namespace rmmseg
6
6
  {
7
- char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
8
- int _pool_size = PRE_ALLOC_SIZE;
7
+ char *_pool_base = static_cast<char *>(std::malloc(PRE_ALLOC_SIZE));
8
+ size_t _pool_size = PRE_ALLOC_SIZE;
9
9
  }
@@ -12,12 +12,12 @@
12
12
 
13
13
  namespace rmmseg
14
14
  {
15
- const int REALLOC_SIZE = 2048; /* 2KB */
15
+ const size_t REALLOC_SIZE = 2048; /* 2KB */
16
16
 
17
- extern int _pool_size;
18
- extern char *_pool_base;
17
+ extern size_t _pool_size;
18
+ extern char *_pool_base;
19
19
 
20
- inline void *pool_alloc(int len)
20
+ inline void *pool_alloc(size_t len)
21
21
  {
22
22
  void *mem = _pool_base;
23
23
 
@@ -36,7 +36,7 @@ extern "C" {
36
36
  */
37
37
  static VALUE dic_load_chars(VALUE mod, VALUE path)
38
38
  {
39
- if (rmmseg::dict::load_chars(RSTRING(path)->ptr))
39
+ if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
40
40
  return Qtrue;
41
41
  return Qfalse;
42
42
  }
@@ -51,7 +51,7 @@ extern "C" {
51
51
  */
52
52
  static VALUE dic_load_words(VALUE mod, VALUE path)
53
53
  {
54
- if (rmmseg::dict::load_words(RSTRING(path)->ptr))
54
+ if (rmmseg::dict::load_words(RSTRING_PTR(path)))
55
55
  return Qtrue;
56
56
  return Qfalse;
57
57
  }
@@ -70,8 +70,8 @@ extern "C" {
70
70
  */
71
71
  static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
72
72
  {
73
- const char *str = RSTRING(word)->ptr;
74
- int nbytes = RSTRING(word)->len;
73
+ const char *str = RSTRING_PTR(word);
74
+ int nbytes = RSTRING_LEN(word);
75
75
  rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
76
76
  rmmseg::dict::add(w);
77
77
  return Qnil;
@@ -88,8 +88,8 @@ extern "C" {
88
88
  */
89
89
  static VALUE dic_has_word(VALUE mod, VALUE word)
90
90
  {
91
- const char *str = RSTRING(word)->ptr;
92
- int nbytes = RSTRING(word)->len;
91
+ const char *str = RSTRING_PTR(word);
92
+ int nbytes = RSTRING_LEN(word);
93
93
  if (rmmseg::dict::get(str, nbytes) != NULL)
94
94
  return Qtrue;
95
95
  return Qfalse;
@@ -162,16 +162,17 @@ extern "C" {
162
162
  int start = t.text-base;
163
163
 
164
164
  // This is necessary, see
165
- // http://pluskid.lifegoo.com/?p=348
165
+ // http://lifegoo.pluskid.org/?p=348
166
166
  volatile VALUE text = rb_str_new(t.text, t.length);
167
167
  tk->text = text;
168
168
 
169
169
  tk->start = INT2FIX(start);
170
170
  tk->end = INT2FIX(start + t.length);
171
- return Data_Wrap_Struct(cToken,
171
+ volatile VALUE tok = Data_Wrap_Struct(cToken,
172
172
  (RUBY_DATA_FUNC)tk_mark,
173
173
  (RUBY_DATA_FUNC)tk_free,
174
174
  tk);
175
+ return tok;
175
176
  }
176
177
 
177
178
  /*********************
@@ -207,8 +208,8 @@ extern "C" {
207
208
  void *mem;
208
209
  algor->text = text;
209
210
  mem = malloc(sizeof(rmmseg::Algorithm));
210
- algor->algor = new(mem) rmmseg::Algorithm(RSTRING(text)->ptr,
211
- RSTRING(text)->len);
211
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
212
+ RSTRING_LEN(text));
212
213
 
213
214
  return Data_Wrap_Struct(klass,
214
215
  (RUBY_DATA_FUNC)algor_mark,
@@ -231,7 +232,8 @@ extern "C" {
231
232
 
232
233
  if (tk.length == 0)
233
234
  return Qnil;
234
- return tk_create(RSTRING(algor->text)->ptr, tk);
235
+ volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
236
+ return rtk;
235
237
  }
236
238
 
237
239
 
@@ -9,37 +9,36 @@
9
9
  namespace rmmseg
10
10
  {
11
11
  template <typename Cmp>
12
- int take_highest(Chunk *chunks, int n, Cmp &cmp)
12
+ void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
13
13
  {
14
- int i = 1, j;
15
- Chunk &max = chunks[0];
16
-
17
- for (j = 1; j < n; ++j)
14
+ unsigned int i = 1, j;
15
+
16
+ for (j = 1; j < chunks.size(); ++j)
18
17
  {
19
- int rlt = cmp(chunks[j], max);
18
+ int rlt = cmp(chunks[j], chunks[0]);
20
19
  if (rlt > 0)
21
20
  i = 0;
22
21
  if (rlt >= 0)
23
22
  std::swap(chunks[i++], chunks[j]);
24
23
  }
25
- return i;
24
+ chunks.erase(chunks.begin()+i, chunks.end());
26
25
  }
27
26
 
28
27
  struct MMCmp_t
29
28
  {
30
- int operator()(Chunk &a, Chunk &b)
29
+ int operator()(const Chunk &a, const Chunk &b) const
31
30
  {
32
31
  return a.total_length() - b.total_length();
33
32
  }
34
33
  } MMCmp;
35
- int mm_filter(Chunk *chunks, int n)
34
+ void mm_filter(std::vector<Chunk> &chunks)
36
35
  {
37
- return take_highest(chunks, n, MMCmp);
36
+ take_highest(chunks, MMCmp);
38
37
  }
39
38
 
40
39
  struct LAWLCmp_t
41
40
  {
42
- int operator()(Chunk &a, Chunk &b)
41
+ int operator()(const Chunk &a, const Chunk &b) const
43
42
  {
44
43
  double rlt = a.average_length() - b.average_length();
45
44
  if (rlt == 0)
@@ -49,14 +48,14 @@ namespace rmmseg
49
48
  return -1;
50
49
  }
51
50
  } LAWLCmp;
52
- int lawl_filter(Chunk *chunks, int n)
51
+ void lawl_filter(std::vector<Chunk> &chunks)
53
52
  {
54
- return take_highest(chunks, n, LAWLCmp);
53
+ take_highest(chunks, LAWLCmp);
55
54
  }
56
55
 
57
56
  struct SVWLCmp_t
58
57
  {
59
- int operator()(Chunk &a, Chunk& b)
58
+ int operator()(const Chunk &a, const Chunk& b) const
60
59
  {
61
60
  double rlt = a.variance() - b.variance();
62
61
  if (rlt == 0)
@@ -66,21 +65,21 @@ namespace rmmseg
66
65
  return -1;
67
66
  }
68
67
  } SVWLCmp;
69
- int svwl_filter(Chunk *chunks, int n)
68
+ void svwl_filter(std::vector<Chunk> &chunks)
70
69
  {
71
- return take_highest(chunks, n, SVWLCmp);
70
+ take_highest(chunks, SVWLCmp);
72
71
  }
73
72
 
74
73
  struct LSDMFOCWCmp_t
75
74
  {
76
- int operator()(Chunk &a, Chunk& b)
75
+ int operator()(const Chunk &a, const Chunk& b) const
77
76
  {
78
77
  return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
79
78
  }
80
79
  } LSDMFOCWCmp;
81
- int lsdmfocw_filter(Chunk *chunks, int n)
80
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
82
81
  {
83
- return take_highest(chunks, n, LSDMFOCWCmp);
82
+ take_highest(chunks, LSDMFOCWCmp);
84
83
  }
85
84
  }
86
85
 
@@ -28,11 +28,11 @@ namespace rmmseg
28
28
  if (freq > USHRT_MAX)
29
29
  freq = USHRT_MAX; /* avoid overflow */
30
30
  if (nbytes == -1)
31
- nbytes = strlen(text);
31
+ nbytes = std::strlen(text);
32
32
  Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
33
  + nbytes+1
34
34
  - word_embed_len));
35
- w->nbytes = std::strlen(text);
35
+ w->nbytes = nbytes;
36
36
  w->length = length;
37
37
  w->freq = freq;
38
38
  std::strncpy(w->text, text, nbytes);
@@ -1,7 +1,7 @@
1
1
  <%# -*- mode: text; coding: utf-8 -*- %>
2
2
  <%
3
3
  $title = "rmmseg-cpp Homepage"
4
- $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
4
+ $authors = { 'pluskid' => 'http://blog.pluskid.org' }
5
5
  %>
6
6
 
7
7
  <% chapter "Introduction" do %>
@@ -110,7 +110,24 @@
110
110
  RMMSeg::Dictionary.load_dictionaries
111
111
  </code>
112
112
 
113
- Now rmmseg-cpp will be ready to do segmenting.
113
+ Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
114
+ dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
115
+ <tt>load_dictionaries</tt>. e.g.
116
+
117
+ <code>
118
+ RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
119
+ [:words, "my_words.dic"],
120
+ [:words, "my_words2.dic"]]
121
+ </code>
122
+
123
+ The basic formats of the char-dictionary and the word-dictionary are similar. For each line,
124
+ there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
125
+ at the end of the dictionary file. The number in the char-dictionary and the word-dictionary
126
+ has a different meaning in each.
127
+
128
+ In char-dictionary, the number means the frequency of the character. In word-dictionary,
129
+ the number means the number of characters in the word. Note that this is NOT the number
130
+ of *bytes* in the word.
114
131
 
115
132
  <% end %>
116
133
 
@@ -139,7 +156,7 @@
139
156
  of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
140
157
 
141
158
  <% figure "Ferret Example Screenshot" do %>
142
- !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
159
+ !http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
143
160
  <% end %>
144
161
 
145
162
  <% end %>
@@ -174,6 +191,6 @@
174
191
  <% chapter "Resources" do %>
175
192
  * "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
176
193
  * "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
177
- * "Free Mind":http://pluskid.lifegoo.com/: The author's blog.
194
+ * "Free Mind":http://blog.pluskid.org/: The author's blog.
178
195
  * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
179
196
  <% end %>
@@ -3,7 +3,7 @@
3
3
  <html xmlns="http://www.w3.org/1999/xhtml">
4
4
  <head>
5
5
  <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
6
- <meta name="date" content="17 September 2008"/>
6
+ <meta name="date" content="10 September 2011"/>
7
7
  <meta name="author" content="pluskid"/>
8
8
  <meta name="generator" content="Gerbil 3.1.0"/>
9
9
  <title>rmmseg-cpp Homepage</title>
@@ -928,8 +928,8 @@
928
928
 
929
929
 
930
930
  <h1 class="title"><a class="here" href="#">rmmseg-cpp Homepage</a></h1>
931
- <h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
932
- <h3 class="date">17 September 2008</h3>
931
+ <h2 class="authors"><a href="http://blog.pluskid.org">pluskid</a></h2>
932
+ <h3 class="date">10 September 2011</h3>
933
933
 
934
934
  </div>
935
935
 
@@ -943,12 +943,12 @@
943
943
  <div id="Contents">
944
944
  <h1 class="title"><a class="here" href="#Contents">Contents</a></h1>
945
945
  <ul>
946
- <li>1&nbsp;&nbsp;<a id="a-607090478" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-607093208" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-607094728" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-607099478" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-607103648" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-607106038" href="#From-Git">From Git</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-607115348" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-607120028" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a-607126248" href="#Use-in-Ruby-program">Use in Ruby program</a><ul><li>3.2.1&nbsp;&nbsp;<a id="a-607131168" href="#Initialize">Initialize</a></li><li>3.2.2&nbsp;&nbsp;<a id="a-607137248" href="#Ferret-Integration">Ferret Integration</a></li><li>3.2.3&nbsp;&nbsp;<a id="a-607154008" href="#Normal-Ruby-program">Normal Ruby program</a></li></ul></li></ul></li><li>4&nbsp;&nbsp;<a id="a-607162878" href="#Who-use-it">Who use it</a></li><li>5&nbsp;&nbsp;<a id="a-607172188" href="#Resources">Resources</a></li>
946
+ <li>1&nbsp;&nbsp;<a id="a16539600" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a16533660" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a16530900" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a16472140" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a16468300" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a16363260" href="#From-Git">From Git</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a16272720" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a16246860" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a16240340" href="#Use-in-Ruby-program">Use in Ruby program</a><ul><li>3.2.1&nbsp;&nbsp;<a id="a16231580" href="#Initialize">Initialize</a></li><li>3.2.2&nbsp;&nbsp;<a id="a16187880" href="#Ferret-Integration">Ferret Integration</a></li><li>3.2.3&nbsp;&nbsp;<a id="a16113620" href="#Normal-Ruby-program">Normal Ruby program</a></li></ul></li></ul></li><li>4&nbsp;&nbsp;<a id="a16072000" href="#Who-use-it">Who use it</a></li><li>5&nbsp;&nbsp;<a id="a16034860" href="#Resources">Resources</a></li>
947
947
 
948
948
  </ul>
949
949
  </div>
950
950
 
951
- <div id="lof"><h1 id="Figures" class="title"><a class="here" href="#Figures">Figures</a></h1> <ol><li><a id="a-607147048" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1 id="Tips" class="title"><a class="here" href="#Tips">Tips</a></h1> <ol><li><a id="a-607168148" href="#Expand-this-list">Expand this list</a></li></ol><h1 id="Warnings" class="title"><a class="here" href="#Warnings">Warnings</a></h1> <ol><li><a id="a-607107678" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></li></ol></div>
951
+ <div id="lof"><h1 id="Figures" class="title"><a class="here" href="#Figures">Figures</a></h1> <ol><li><a id="a16148860" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1 id="Tips" class="title"><a class="here" href="#Tips">Tips</a></h1> <ol><li><a id="a16067160" href="#Expand-this-list">Expand this list</a></li></ol><h1 id="Warnings" class="title"><a class="here" href="#Warnings">Warnings</a></h1> <ol><li><a id="a16360020" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></li></ol></div>
952
952
 
953
953
  <br style="display: none"/>
954
954
  <hr style="display: none"/>
@@ -958,30 +958,30 @@
958
958
  <div class="chapter">
959
959
  <h1 class="title">
960
960
  Chapter
961
- <a class="list" id="Introduction" href="#a-607090478">1</a>
961
+ <a class="list" id="Introduction" href="#a16539600">1</a>
962
962
 
963
963
  <br/>
964
964
 
965
965
  <a class="here" href="#Introduction"><big>Introduction</big></a>
966
966
  </h1>
967
967
 
968
- <div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for<br />
969
- Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration<br />
968
+ <div class="content"><p>rmmseg-cpp is a high performance Chinese word segmentation utility for
969
+ Ruby. It features full <a href="http://ferret.davebalmain.com/">Ferret</a> integration
970
970
  as well as support for normal Ruby program usage.</p>
971
- <p>rmmseg-cpp is a re-written of the original<br />
972
- <a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written<br />
973
- in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes<br />
971
+ <p>rmmseg-cpp is a re-written of the original
972
+ <a href="http://rmmseg.rubyforge.org/">RMMSeg</a> gem in C++. RMMSeg is written
973
+ in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
974
974
  lots of memory and the segmenting process is rather slow.</p>
975
- <p>The interface is almost identical to RMMSeg but the performance is<br />
976
- much better. This gem is always preferable in production<br />
977
- use. However, if you want to understand how the MMSEG segmenting<br />
978
- algorithm works, the source code of RMMSeg is a better choice than<br />
975
+ <p>The interface is almost identical to RMMSeg but the performance is
976
+ much better. This gem is always preferable in production
977
+ use. However, if you want to understand how the MMSEG segmenting
978
+ algorithm works, the source code of RMMSeg is a better choice than
979
979
  this.</p></div>
980
980
  </div>
981
981
  <div class="chapter">
982
982
  <h1 class="title">
983
983
  Chapter
984
- <a class="list" id="Setup" href="#a-607093208">2</a>
984
+ <a class="list" id="Setup" href="#a16533660">2</a>
985
985
 
986
986
  <br/>
987
987
 
@@ -990,7 +990,7 @@ this.</p></div>
990
990
 
991
991
  <div class="content"><div class="section">
992
992
  <h2 class="title">
993
- <a class="list" id="Requirements" href="#a-607094728">2.1</a>&nbsp;&nbsp;<a class="here" href="#Requirements">Requirements</a>
993
+ <a class="list" id="Requirements" href="#a16530900">2.1</a>&nbsp;&nbsp;<a class="here" href="#Requirements">Requirements</a>
994
994
  </h2>
995
995
  <div class="content"><p>Your system needs the following software to run RMMSeg.</p>
996
996
  <table border="1">
@@ -1011,37 +1011,37 @@ this.</p></div>
1011
1011
  <td> Used to build the native extension </td>
1012
1012
  </tr>
1013
1013
  </table></div>
1014
- </div><br />
1014
+ </div>
1015
1015
  <div class="section">
1016
1016
  <h2 class="title">
1017
- <a class="list" id="Installation" href="#a-607099478">2.2</a>&nbsp;&nbsp;<a class="here" href="#Installation">Installation</a>
1017
+ <a class="list" id="Installation" href="#a16472140">2.2</a>&nbsp;&nbsp;<a class="here" href="#Installation">Installation</a>
1018
1018
  </h2>
1019
1019
  <div class="content"><div class="section">
1020
1020
  <h3 class="title">
1021
- <a class="list" id="Using-RubyGems" href="#a-607103648">2.2.1</a>&nbsp;&nbsp;<a class="here" href="#Using-RubyGems">Using RubyGems</a>
1021
+ <a class="list" id="Using-RubyGems" href="#a16468300">2.2.1</a>&nbsp;&nbsp;<a class="here" href="#Using-RubyGems">Using RubyGems</a>
1022
1022
  </h3>
1023
1023
  <div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a>:</p>
1024
1024
  sudo gem install rmmseg-cpp
1025
- <p>Or you can download the gem file manually from<br />
1026
- <a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and<br />
1025
+ <p>Or you can download the gem file manually from
1026
+ <a href="http://rubyforge.org/projects/rmmseg-cpp/">RubyForge</a> and
1027
1027
  install it locally:</p>
1028
1028
  sudo gem install &#8212;local rmmseg-cpp-x.y.z.gem</div>
1029
- </div><br />
1029
+ </div>
1030
1030
  <div class="section">
1031
1031
  <h3 class="title">
1032
- <a class="list" id="From-Git" href="#a-607106038">2.2.2</a>&nbsp;&nbsp;<a class="here" href="#From-Git">From Git</a>
1032
+ <a class="list" id="From-Git" href="#a16363260">2.2.2</a>&nbsp;&nbsp;<a class="here" href="#From-Git">From Git</a>
1033
1033
  </h3>
1034
- <div class="content"><p>To build the gem manually from the latest source code. You&#8217;ll<br />
1034
+ <div class="content"><p>To build the gem manually from the latest source code. You&#8217;ll
1035
1035
  need to have <strong>git</strong> and <strong>rake</strong> installed.</p>
1036
1036
  <p><div class="warning">
1037
- <p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#a-607107678">Warning 1</a>.&nbsp;&nbsp;<a class="here" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></p>
1037
+ <p class="title"><a class="list" id="The-latest-source-code-may-be-unstable" href="#a16360020">Warning 1</a>.&nbsp;&nbsp;<a class="here" href="#The-latest-source-code-may-be-unstable">The latest source code may be unstable</a></p>
1038
1038
 
1039
- <div class="content icon-warning">While I tried to avoid such kind of problems, the source<br />
1040
- code from the repository might still be broken sometimes.<br />
1039
+ <div class="content icon-warning">While I tried to avoid such kind of problems, the source
1040
+ code from the repository might still be broken sometimes.
1041
1041
  It is generally not recommended to follow the source code.</div>
1042
- </div> <br />
1043
- The source code of rmmseg-cpp is hosted at<br />
1044
- <a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the<br />
1042
+ </div>
1043
+ The source code of rmmseg-cpp is hosted at
1044
+ <a href="http://github.com/pluskid/rmmseg-cpp/">GitHub</a>. You can get the
1045
1045
  source code by git clone:</p>
1046
1046
  git clone git://github.com/pluskid/rmmseg-cpp.git
1047
1047
  <p>then you can use Rake to build and install the gem:</p>
@@ -1053,97 +1053,107 @@ rake gem:install</div>
1053
1053
  <div class="chapter">
1054
1054
  <h1 class="title">
1055
1055
  Chapter
1056
- <a class="list" id="Usage" href="#a-607115348">3</a>
1056
+ <a class="list" id="Usage" href="#a16272720">3</a>
1057
1057
 
1058
1058
  <br/>
1059
1059
 
1060
1060
  <a class="here" href="#Usage"><big>Usage</big></a>
1061
1061
  </h1>
1062
1062
 
1063
- <div class="content"><div class="section">
1063
+ <div class="content">
1064
+ <p><div class="section">
1064
1065
  <h2 class="title">
1065
- <a class="list" id="Stand-Alone-rmmseg" href="#a-607120028">3.1</a>&nbsp;&nbsp;<a class="here" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a>
1066
+ <a class="list" id="Stand-Alone-rmmseg" href="#a16246860">3.1</a>&nbsp;&nbsp;<a class="here" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a>
1066
1067
  </h2>
1067
- <div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option:<br />
1068
- <br />
1069
- rmmseg -h</p>
1070
- <p>It reads from STDIN and print result to STDOUT. Here is a real<br />
1068
+ <div class="content"><p>rmmseg-cpp comes with a script <strong>rmmseg</strong>. To get the basic usage, just execute it with <tt>-h</tt> option:</p>
1069
+ rmmseg -h
1070
+ <p>It reads from STDIN and prints the result to STDOUT. Here is a real
1071
1071
  example:</p>
1072
1072
  $ echo &#8220;我们都喜欢用 Ruby&#8221; | rmmseg
1073
1073
  我们 都 喜欢 用 Ruby</div>
1074
- </div><br />
1074
+ </div>
1075
1075
  <div class="section">
1076
1076
  <h2 class="title">
1077
- <a class="list" id="Use-in-Ruby-program" href="#a-607126248">3.2</a>&nbsp;&nbsp;<a class="here" href="#Use-in-Ruby-program">Use in Ruby program</a>
1077
+ <a class="list" id="Use-in-Ruby-program" href="#a16240340">3.2</a>&nbsp;&nbsp;<a class="here" href="#Use-in-Ruby-program">Use in Ruby program</a>
1078
1078
  </h2>
1079
1079
  <div class="content"><div class="section">
1080
1080
  <h3 class="title">
1081
- <a class="list" id="Initialize" href="#a-607131168">3.2.1</a>&nbsp;&nbsp;<a class="here" href="#Initialize">Initialize</a>
1081
+ <a class="list" id="Initialize" href="#a16231580">3.2.1</a>&nbsp;&nbsp;<a class="here" href="#Initialize">Initialize</a>
1082
1082
  </h3>
1083
1083
  <div class="content"><p>To use rmmseg-cpp in Ruby program, you&#8217;ll first load it with RubyGems:</p>
1084
1084
  <pre class="code">
1085
- require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
1086
- require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
1085
+ require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rubygems</span><span style="color:#710">'</span></span>
1086
+ require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg</span><span style="color:#710">'</span></span>
1087
1087
  </pre>
1088
- <p>Then you may customize the dictionaries used by rmmseg-cpp<br />
1089
- (see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on<br />
1088
+ <p>Then you may customize the dictionaries used by rmmseg-cpp
1089
+ (see <a href="http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html">the rdoc</a> on
1090
1090
  how to add your own dictionaries) and load all dictionaries:</p>
1091
1091
  <pre class="code">
1092
- <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Dictionary</span>.load_dictionaries
1092
+ <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.load_dictionaries
1093
+ </pre>
1094
+ <p>Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
1095
+ dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
1096
+ <tt>load_dictionaries</tt>. e.g.</p>
1097
+ <pre class="code">
1098
+ <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Dictionary</span>.dictionaries = [[<span style="color:#A60">:chars</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">&quot;</span><span style="">my_chars.dic</span><span style="color:#710">&quot;</span></span>],
1099
+ [<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">&quot;</span><span style="">my_words.dic</span><span style="color:#710">&quot;</span></span>],
1100
+ [<span style="color:#A60">:words</span>, <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">&quot;</span><span style="">my_words2.dic</span><span style="color:#710">&quot;</span></span>]]
1093
1101
  </pre>
1094
- <p>Now rmmseg-cpp will be ready to do segmenting.</p></div>
1095
- </div><br />
1102
+ <p>The basic formats of char-dictionary and word-dictionary are similar. For each line,
1103
+ there is a number, then <strong>a</strong> space, then the string. Note there <strong>SHOULD</strong> be a newline
1104
+ at the end of the dictionary file. And the number in char-dictionary and word-dictionary
1105
+ has a different meaning.</p>
1106
+ <p>In char-dictionary, the number means the frequency of the character. In word-dictionary,
1107
+ the number means the number of characters in the word. Note that this is NOT the number
1108
+ of <strong>bytes</strong> in the word.</p></div>
1109
+ </div>
1096
1110
  <div class="section">
1097
1111
  <h3 class="title">
1098
- <a class="list" id="Ferret-Integration" href="#a-607137248">3.2.2</a>&nbsp;&nbsp;<a class="here" href="#Ferret-Integration">Ferret Integration</a>
1112
+ <a class="list" id="Ferret-Integration" href="#a16187880">3.2.2</a>&nbsp;&nbsp;<a class="here" href="#Ferret-Integration">Ferret Integration</a>
1099
1113
  </h3>
1100
- <div class="content"><p>To use rmmseg-cpp with Ferret, you&#8217;ll need to <code class="code">require</code> the<br />
1101
- Ferret support of rmmseg-cpp (Of course you&#8217;ll also have to<br />
1102
- got Ferret installed. If you have problems running the belowing<br />
1103
- example, please try to update to the latest version of both<br />
1104
- Ferret and rmmseg-cpp first):<br />
1105
- <br />
1114
+ <div class="content"><p>To use rmmseg-cpp with Ferret, you&#8217;ll need to <code class="code">require</code> the
1115
+ Ferret support of rmmseg-cpp (Of course you&#8217;ll also have to
1116
+ get Ferret installed. If you have problems running the following
1117
+ example, please try to update to the latest version of both
1118
+ Ferret and rmmseg-cpp first):</p>
1106
1119
  <pre class="code">
1107
- require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
1108
- </pre></p>
1109
- <p>rmmseg-cpp comes with a ready to use Ferret analyzer:<br />
1110
- <br />
1120
+ require <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">'</span><span style="">rmmseg/ferret</span><span style="color:#710">'</span></span>
1121
+ </pre>
1122
+ <p>rmmseg-cpp comes with a ready to use Ferret analyzer:</p>
1111
1123
  <pre class="code">
1112
- analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new { |tokenizer|
1113
- <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analysis</span>::<span style="color:#036; font-weight:bold">LowerCaseFilter</span>.new(tokenizer)
1124
+ analyzer = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analyzer</span>.new { |tokenizer|
1125
+ <span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Analysis</span>::<span style="color:#036;font-weight:bold">LowerCaseFilter</span>.new(tokenizer)
1114
1126
  }
1115
- index = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
1116
- </pre><br />
1117
- <br />
1118
- A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result<br />
1119
- of running that example is shown in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1. Ferret Example Screenshot</a>.<br />
1120
- <br />
1121
- <div class="figure">
1122
- <p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a-607147048">Figure 1</a>.&nbsp;&nbsp;<a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
1123
- <div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
1127
+ index = <span style="color:#036;font-weight:bold">Ferret</span>::<span style="color:#036;font-weight:bold">Index</span>::<span style="color:#036;font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
1128
+ </pre>
1129
+ <p>A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
1130
+ of running that example is shown in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1. Ferret Example Screenshot</a>.</p>
1131
+ <p><div class="figure">
1132
+ <p class="title"><a class="list" id="Ferret-Example-Screenshot" href="#a16148860">Figure 1</a>.&nbsp;&nbsp;<a class="here" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></p>
1133
+ <div class="content"><img src="http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
1124
1134
  </div></p></div>
1125
- </div><br />
1135
+ </div>
1126
1136
  <div class="section">
1127
1137
  <h3 class="title">
1128
- <a class="list" id="Normal-Ruby-program" href="#a-607154008">3.2.3</a>&nbsp;&nbsp;<a class="here" href="#Normal-Ruby-program">Normal Ruby program</a>
1138
+ <a class="list" id="Normal-Ruby-program" href="#a16113620">3.2.3</a>&nbsp;&nbsp;<a class="here" href="#Normal-Ruby-program">Normal Ruby program</a>
1129
1139
  </h3>
1130
- <div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create<br />
1131
- an <code class="code"><span style="color:#036; font-weight:bold">Algorithm</span></code> object and call <code class="code">next_token</code> until a <code class="code"><span style="color:#038; font-weight:bold">nil</span></code> is returned:</p>
1140
+ <div class="content"><p>rmmseg-cpp can also be used in normal Ruby programs. Just create
1141
+ an <code class="code"><span style="color:#036;font-weight:bold">Algorithm</span></code> object and call <code class="code">next_token</code> until a <code class="code"><span style="color:#038;font-weight:bold">nil</span></code> is returned:</p>
1132
1142
  <pre class="code">
1133
- algor = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Algorithm</span>.new(text)
1134
- loop <span style="color:#080; font-weight:bold">do</span>
1143
+ algor = <span style="color:#036;font-weight:bold">RMMSeg</span>::<span style="color:#036;font-weight:bold">Algorithm</span>.new(text)
1144
+ loop <span style="color:#080;font-weight:bold">do</span>
1135
1145
  tok = algor.next_token
1136
- <span style="color:#080; font-weight:bold">break</span> <span style="color:#080; font-weight:bold">if</span> tok.nil?
1137
- puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>tok.text<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20"> [</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>tok.start<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">..</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>tok.end<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">]</span><span style="color:#710">&quot;</span></span>
1138
- <span style="color:#080; font-weight:bold">end</span>
1146
+ <span style="color:#080;font-weight:bold">break</span> <span style="color:#080;font-weight:bold">if</span> tok.nil?
1147
+ puts <span style="background-color:#fff0f0;color:#D20"><span style="color:#710">&quot;</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.text<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style=""> [</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.start<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">..</span><span style="background:#ddd;color:black"><span style="background:#ddd;font-weight:bold;color:#666">#{</span>tok.end<span style="background:#ddd;font-weight:bold;color:#666">}</span></span><span style="">]</span><span style="color:#710">&quot;</span></span>
1148
+ <span style="color:#080;font-weight:bold">end</span>
1139
1149
  </pre></div>
1140
1150
  </div></div>
1141
- </div></div>
1151
+ </div></p></div>
1142
1152
  </div>
1143
1153
  <div class="chapter">
1144
1154
  <h1 class="title">
1145
1155
  Chapter
1146
- <a class="list" id="Who-use-it" href="#a-607162878">4</a>
1156
+ <a class="list" id="Who-use-it" href="#a16072000">4</a>
1147
1157
 
1148
1158
  <br/>
1149
1159
 
@@ -1151,20 +1161,20 @@ loop <span style="color:#080; font-weight:bold">do</span>
1151
1161
  </h1>
1152
1162
 
1153
1163
  <div class="content"><p><div class="tip">
1154
- <p class="title"><a class="list" id="Expand-this-list" href="#a-607168148">Tip 1</a>.&nbsp;&nbsp;<a class="here" href="#Expand-this-list">Expand this list</a></p>
1164
+ <p class="title"><a class="list" id="Expand-this-list" href="#a16067160">Tip 1</a>.&nbsp;&nbsp;<a class="here" href="#Expand-this-list">Expand this list</a></p>
1155
1165
 
1156
- <div class="content icon-tip">If you used rmmseg-cpp and would like your project to<br />
1166
+ <div class="content icon-tip">If you used rmmseg-cpp and would like your project to
1157
1167
  appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</div>
1158
1168
  </div></p>
1159
1169
  <ul>
1160
- <li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developper<br />
1170
+ <li><a href="http://www.javaeye.com/">JavaEye</a>: One of the biggest software developer
1161
1171
  community in China.</li>
1162
1172
  </ul></div>
1163
1173
  </div>
1164
1174
  <div class="chapter">
1165
1175
  <h1 class="title">
1166
1176
  Chapter
1167
- <a class="list" id="Resources" href="#a-607172188">5</a>
1177
+ <a class="list" id="Resources" href="#a16034860">5</a>
1168
1178
 
1169
1179
  <br/>
1170
1180
 
@@ -1174,7 +1184,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
1174
1184
  <div class="content"><ul>
1175
1185
  <li><a href="http://rubyforge.org/projects/rmmseg-cpp/">Project Home</a>: The Project page at RubyForge.</li>
1176
1186
  <li><a href="http://rmmseg-cpp.rubyforge.org/rdoc/index.html">RDoc of rmmseg-cpp</a>: The auto generated rdoc of RMMSeg.</li>
1177
- <li><a href="http://pluskid.lifegoo.com/">Free Mind</a>: The author&#8217;s blog.</li>
1187
+ <li><a href="http://blog.pluskid.org/">Free Mind</a>: The author&#8217;s blog.</li>
1178
1188
  <li><a href="mailto:pluskid@gmail.com">Author&#8217;s Email</a>: Contact me if you have any problem.</li>
1179
1189
  </ul></div>
1180
1190
  </div></div>
@@ -1187,7 +1197,7 @@ appear in this list, please <a href="mailto:pluskid@gmail.com">contact me</a>.</
1187
1197
 
1188
1198
  <div id="footer">
1189
1199
 
1190
- Generated on Wed Sep 17 10:18:56 -0400 2008 by <a href="http://gerbil.rubyforge.org">Gerbil</a> 3.1.0.
1200
+ Generated on 2011-09-10 15:59:08 +0800 by <a href="http://gerbil.rubyforge.org">Gerbil</a> 3.1.0.
1191
1201
 
1192
1202
  <div id="footer-credits">
1193
1203
  <span class="icon-warning" style="float: right">&nbsp;</span>
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg-cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 2
8
+ - 9
9
+ version: 0.2.9
5
10
  platform: ruby
6
11
  authors:
7
12
  - pluskid
@@ -9,11 +14,25 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2008-09-17 00:00:00 -04:00
17
+ date: 2011-09-10 00:00:00 +08:00
13
18
  default_executable:
14
19
  dependencies: []
15
20
 
16
- description: rmmseg-cpp is a high performance Chinese word segmentation utility for Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration as well as support for normal Ruby program usage. rmmseg-cpp is a re-written of the original RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes lots of memory and the segmenting process is rather slow. The interface is almost identical to RMMSeg but the performance is much better. This gem is always preferable in production use. However, if you want to understand how the MMSEG segmenting algorithm works, the source code of RMMSeg is a better choice than this.
21
+ description: |-
22
+ rmmseg-cpp is a high performance Chinese word segmentation utility for
23
+ Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
24
+ as well as support for normal Ruby program usage.
25
+
26
+ rmmseg-cpp is a rewrite of the original
27
+ RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written
28
+ in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
29
+ lots of memory and the segmenting process is rather slow.
30
+
31
+ The interface is almost identical to RMMSeg but the performance is
32
+ much better. This gem is always preferable in production
33
+ use. However, if you want to understand how the MMSEG segmenting
34
+ algorithm works, the source code of RMMSeg is a better choice than
35
+ this.
17
36
  email: pluskid@gmail.com
18
37
  executables:
19
38
  - rmmseg
@@ -69,6 +88,8 @@ files:
69
88
  - test/test_rmmseg.rb
70
89
  has_rdoc: true
71
90
  homepage: http://rmmseg-cpp.rubyforge.org
91
+ licenses: []
92
+
72
93
  post_install_message:
73
94
  rdoc_options:
74
95
  - --main
@@ -77,23 +98,27 @@ require_paths:
77
98
  - lib
78
99
  - ext
79
100
  required_ruby_version: !ruby/object:Gem::Requirement
101
+ none: false
80
102
  requirements:
81
103
  - - ">="
82
104
  - !ruby/object:Gem::Version
105
+ segments:
106
+ - 0
83
107
  version: "0"
84
- version:
85
108
  required_rubygems_version: !ruby/object:Gem::Requirement
109
+ none: false
86
110
  requirements:
87
111
  - - ">="
88
112
  - !ruby/object:Gem::Version
113
+ segments:
114
+ - 0
89
115
  version: "0"
90
- version:
91
116
  requirements: []
92
117
 
93
118
  rubyforge_project: rmmseg-cpp
94
- rubygems_version: 1.2.0
119
+ rubygems_version: 1.3.7
95
120
  signing_key:
96
- specification_version: 2
121
+ specification_version: 3
97
122
  summary: rmmseg-cpp is a high performance Chinese word segmentation utility for Ruby
98
123
  test_files:
99
124
  - test/test_rmmseg.rb