rmmseg-cpp-traditional 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/History.txt +21 -0
  4. data/LICENSE.txt +22 -0
  5. data/Manifest.txt +43 -0
  6. data/README +111 -0
  7. data/README.md +29 -0
  8. data/Rakefile +19 -0
  9. data/bin/rmmseg +63 -0
  10. data/data/chars.dic +12638 -0
  11. data/data/words.dic +120308 -0
  12. data/ext/rmmseg/algor.cpp +222 -0
  13. data/ext/rmmseg/algor.h +80 -0
  14. data/ext/rmmseg/chunk.h +59 -0
  15. data/ext/rmmseg/dict.cpp +230 -0
  16. data/ext/rmmseg/dict.h +34 -0
  17. data/ext/rmmseg/extconf.rb +17 -0
  18. data/ext/rmmseg/memory.cpp +9 -0
  19. data/ext/rmmseg/memory.h +43 -0
  20. data/ext/rmmseg/rmmseg.cpp +263 -0
  21. data/ext/rmmseg/rules.h +86 -0
  22. data/ext/rmmseg/token.h +19 -0
  23. data/ext/rmmseg/word.h +44 -0
  24. data/lib/rmmseg/dictionary.rb +59 -0
  25. data/lib/rmmseg/ferret.rb +64 -0
  26. data/lib/rmmseg-cpp-traditional/version.rb +7 -0
  27. data/lib/rmmseg-cpp-traditional.rb +9 -0
  28. data/lib/rmmseg.rb +3 -0
  29. data/misc/convert.rb +114 -0
  30. data/misc/ferret_example.rb +59 -0
  31. data/misc/homepage.erb +196 -0
  32. data/misc/homepage.html +1212 -0
  33. data/rmmseg-cpp-traditional.gemspec +19 -0
  34. data/spec/rmmseg_spec.rb +8 -0
  35. data/spec/spec_helper.rb +17 -0
  36. data/tasks/ann.rake +81 -0
  37. data/tasks/bones.rake +21 -0
  38. data/tasks/gem.rake +126 -0
  39. data/tasks/git.rake +41 -0
  40. data/tasks/homepage.rake +15 -0
  41. data/tasks/manifest.rake +49 -0
  42. data/tasks/notes.rake +28 -0
  43. data/tasks/post_load.rake +39 -0
  44. data/tasks/rdoc.rake +51 -0
  45. data/tasks/rubyforge.rake +58 -0
  46. data/tasks/setup.rb +268 -0
  47. data/tasks/spec.rake +55 -0
  48. data/tasks/svn.rake +48 -0
  49. data/tasks/test.rake +38 -0
  50. data/test/test_rmmseg.rb +0 -0
  51. metadata +116 -0
@@ -0,0 +1,263 @@
1
+ #include <ruby.h>
2
+ #include <cstdio> // for debug
3
+
4
+ #include "token.h"
5
+ #include "dict.h"
6
+ #include "algor.h"
7
+
8
+ using namespace std;
9
+
10
+ extern "C" {
11
+
12
+ /*****************************************
13
+ *
14
+ * Normal interface
15
+ *
16
+ *****************************************/
17
+
18
+ /*********************
19
+ * RMMSeg module
20
+ *********************/
21
+ static VALUE mRMMSeg;
22
+
23
+
24
+ /*********************
25
+ * Dictionary module
26
+ *********************/
27
+ static VALUE mDictionary;
28
+
29
+ /*
30
+ * Load a character dictionary.
31
+ *
32
+ * call-seq:
33
+ * load_chars(path) -> status
34
+ *
35
+ * Return +true+ if loaded successfully, +false+ otherwise.
36
+ */
37
+ static VALUE dic_load_chars(VALUE mod, VALUE path)
38
+ {
39
+ if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
40
+ return Qtrue;
41
+ return Qfalse;
42
+ }
43
+
44
+ /*
45
+ * Load a word dictionary.
46
+ *
47
+ * call-seq:
48
+ * load_words(path) -> status
49
+ *
50
+ * Return +true+ if loaded successfully, +false+ otherwise.
51
+ */
52
+ static VALUE dic_load_words(VALUE mod, VALUE path)
53
+ {
54
+ if (rmmseg::dict::load_words(RSTRING_PTR(path)))
55
+ return Qtrue;
56
+ return Qfalse;
57
+ }
58
+
59
+ /*
60
+ * Add a word to the in-memory dictionary.
61
+ *
62
+ * call-seq:
63
+ * add(word, length, freq)
64
+ *
65
+ * - +word+ is a String.
66
+ * - +length+ is number of characters (not number of bytes) of the
67
+ * word to be added.
68
+ * - +freq+ is the frequency of the word. This is only used when
69
+ * it is a one-character word.
70
+ */
71
+ static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
72
+ {
73
+ const char *str = RSTRING_PTR(word);
74
+ int nbytes = RSTRING_LEN(word);
75
+ rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
76
+ rmmseg::dict::add(w);
77
+ return Qnil;
78
+ }
79
+
80
+ /*
81
+ * Check whether one word is included in the dictionary.
82
+ *
83
+ * call-seq:
84
+ * has_word?(word) -> result
85
+ *
86
+ * Return +true+ if the word is included in the dictionary,
87
+ * +false+ otherwise.
88
+ */
89
+ static VALUE dic_has_word(VALUE mod, VALUE word)
90
+ {
91
+ const char *str = RSTRING_PTR(word);
92
+ int nbytes = RSTRING_LEN(word);
93
+ if (rmmseg::dict::get(str, nbytes) != NULL)
94
+ return Qtrue;
95
+ return Qfalse;
96
+ }
97
+
98
+
99
+ /**********************
100
+ * Token Class
101
+ **********************/
102
/* Native payload wrapped by an RMMSeg::Token Ruby object (see tk_create). */
struct Token
{
    VALUE text;   /* Ruby String holding a copy of the token's bytes */
    VALUE start;  /* Fixnum: offset of the token's first byte in the text */
    VALUE end;    /* Fixnum: start plus the token's byte length */
};
108
+
109
/* GC mark callback registered for Token objects in tk_create. */
static void tk_mark(Token *t)
{
    // start and end are Fixnums, no need to mark
    rb_gc_mark(t->text);
}
114
+ static void tk_free(Token *t)
115
+ {
116
+ free(t);
117
+ }
118
+
119
+ /*
120
+ * Get the text held by this token.
121
+ *
122
+ * call-seq:
123
+ * text() -> text
124
+ *
125
+ */
126
+ static VALUE tk_text(VALUE self)
127
+ {
128
+ Token *tk = (Token *)DATA_PTR(self);
129
+ return tk->text;
130
+ }
131
+
132
+ /*
133
+ * Get the start position of this token.
134
+ *
135
+ * call-seq:
136
+ * start() -> start_pos
137
+ *
138
+ */
139
+ static VALUE tk_start(VALUE self)
140
+ {
141
+ Token *tk = (Token *)DATA_PTR(self);
142
+ return tk->start;
143
+ }
144
+
145
+ /*
146
+ * Get the end position of this token.
147
+ *
148
+ * call-seq:
149
+ * end() -> end_pos
150
+ *
151
+ */
152
+ static VALUE tk_end(VALUE self)
153
+ {
154
+ Token *tk = (Token *)DATA_PTR(self);
155
+ return tk->end;
156
+ }
157
+
158
+ static VALUE cToken;
159
/*
 * Wrap a native rmmseg::Token in a new Ruby object of class cToken.
 *
 * base - start of the buffer being segmented.  t.text is subtracted
 *        from it, so t.text is expected to point into that buffer.
 * t    - the native token; t.length bytes starting at t.text are
 *        copied into a fresh Ruby String.
 *
 * start/end are stored as Fixnums holding byte offsets relative to
 * base (end = start + t.length).
 *
 * NOTE(review): the struct is ALLOC'ed before rb_str_new is called; if
 * the string allocation were to raise, tk would presumably leak --
 * confirm whether that matters here.
 */
static VALUE tk_create(const char* base, const rmmseg::Token &t)
{
    Token *tk = ALLOC(Token);
    int start = t.text-base;

    // This is necessary, see
    // http://lifegoo.pluskid.org/?p=348
    // (the volatile local keeps the new String visible while the
    // wrapper object is still being built)
    volatile VALUE text = rb_str_new(t.text, t.length);
    tk->text = text;

    tk->start = INT2FIX(start);
    tk->end = INT2FIX(start + t.length);
    // tk_mark/tk_free become the GC callbacks that own `tk`.
    volatile VALUE tok = Data_Wrap_Struct(cToken,
                                          (RUBY_DATA_FUNC)tk_mark,
                                          (RUBY_DATA_FUNC)tk_free,
                                          tk);
    return tok;
}
177
+
178
+ /*********************
179
+ * Algorithm Class
180
+ *********************/
181
/* Native payload wrapped by an RMMSeg::Algorithm Ruby object (see algor_create). */
struct Algorithm
{
    VALUE text;               // hold to avoid being garbage collected
    rmmseg::Algorithm *algor; // segmenter constructed over text's buffer
};
186
+
187
/* GC mark callback: keeps the text String referenced by the wrapper alive. */
static void algor_mark(Algorithm *a)
{
    rb_gc_mark(a->text);
}
191
+ static void algor_free(Algorithm *a)
192
+ {
193
+ free(a->algor);
194
+ }
195
+
196
+ static VALUE cAlgorithm;
197
+
198
+ /*
199
+ * Create an Algorithm object to do segmenting on +text+.
200
+ *
201
+ * call-seq:
202
+ * new(text) -> algorithm
203
+ *
204
+ */
205
+ static VALUE algor_create(VALUE klass, VALUE text)
206
+ {
207
+ Algorithm *algor = ALLOC(Algorithm);
208
+ void *mem;
209
+ algor->text = text;
210
+ mem = malloc(sizeof(rmmseg::Algorithm));
211
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
212
+ RSTRING_LEN(text));
213
+
214
+ return Data_Wrap_Struct(klass,
215
+ (RUBY_DATA_FUNC)algor_mark,
216
+ (RUBY_DATA_FUNC)algor_free,
217
+ algor);
218
+ }
219
+
220
+ /*
221
+ * Get next token.
222
+ *
223
+ * call-seq:
224
+ * next_token() -> token
225
+ *
226
+ * Return +nil+ if no more token available.
227
+ */
228
+ static VALUE algor_next_token(VALUE self)
229
+ {
230
+ Algorithm *algor = (Algorithm *)DATA_PTR(self);
231
+ rmmseg::Token tk = algor->algor->next_token();
232
+
233
+ if (tk.length == 0)
234
+ return Qnil;
235
+ volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
236
+ return rtk;
237
+ }
238
+
239
+
240
+ void Init_rmmseg()
241
+ {
242
+ mRMMSeg = rb_define_module("RMMSeg");
243
+
244
+ /* Manage dictionaries used by rmmseg. */
245
+ mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
246
+ rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
247
+ rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
248
+ rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
249
+ rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
250
+
251
+ /* A Token hold the text and related position information. */
252
+ cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
253
+ rb_undef_method(rb_singleton_class(cToken), "new");
254
+ rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
255
+ rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
256
+ rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
257
+
258
+ /* An Algorithm object use the MMSEG algorithm to do segmenting. */
259
+ cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
260
+ rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
261
+ rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
262
+ }
263
+ }
@@ -0,0 +1,86 @@
1
+ #ifndef _RULES_H_
2
+ #define _RULES_H_
3
+
4
+ #include <vector>
5
+ #include <algorithm>
6
+
7
+ #include "chunk.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ template <typename Cmp>
12
+ void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
13
+ {
14
+ unsigned int i = 1, j;
15
+
16
+ for (j = 1; j < chunks.size(); ++j)
17
+ {
18
+ int rlt = cmp(chunks[j], chunks[0]);
19
+ if (rlt > 0)
20
+ i = 0;
21
+ if (rlt >= 0)
22
+ std::swap(chunks[i++], chunks[j]);
23
+ }
24
+ chunks.erase(chunks.begin()+i, chunks.end());
25
+ }
26
+
27
+ struct MMCmp_t
28
+ {
29
+ int operator()(const Chunk &a, const Chunk &b) const
30
+ {
31
+ return a.total_length() - b.total_length();
32
+ }
33
+ } MMCmp;
34
+ void mm_filter(std::vector<Chunk> &chunks)
35
+ {
36
+ take_highest(chunks, MMCmp);
37
+ }
38
+
39
+ struct LAWLCmp_t
40
+ {
41
+ int operator()(const Chunk &a, const Chunk &b) const
42
+ {
43
+ double rlt = a.average_length() - b.average_length();
44
+ if (rlt == 0)
45
+ return 0;
46
+ if (rlt > 0)
47
+ return 1;
48
+ return -1;
49
+ }
50
+ } LAWLCmp;
51
+ void lawl_filter(std::vector<Chunk> &chunks)
52
+ {
53
+ take_highest(chunks, LAWLCmp);
54
+ }
55
+
56
+ struct SVWLCmp_t
57
+ {
58
+ int operator()(const Chunk &a, const Chunk& b) const
59
+ {
60
+ double rlt = a.variance() - b.variance();
61
+ if (rlt == 0)
62
+ return 0;
63
+ if (rlt < 0)
64
+ return 1;
65
+ return -1;
66
+ }
67
+ } SVWLCmp;
68
+ void svwl_filter(std::vector<Chunk> &chunks)
69
+ {
70
+ take_highest(chunks, SVWLCmp);
71
+ }
72
+
73
+ struct LSDMFOCWCmp_t
74
+ {
75
+ int operator()(const Chunk &a, const Chunk& b) const
76
+ {
77
+ return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
78
+ }
79
+ } LSDMFOCWCmp;
80
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
81
+ {
82
+ take_highest(chunks, LSDMFOCWCmp);
83
+ }
84
+ }
85
+
86
+ #endif /* _RULES_H_ */
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
+ namespace rmmseg
5
+ {
6
/*
 * A slice of the text being segmented.
 *
 * The slice is described by a pointer plus a byte count; the bytes
 * are not required to be nul-terminated.  A token whose length is 0
 * is the empty token.
 */
struct Token
{
    const char *text;   // first byte of the slice (not owned)
    int length;         // number of bytes; 0 means "empty token"

    Token(const char *txt, int len)
        : text(txt),
          length(len)
    {
    }
};
17
+ }
18
+
19
+ #endif /* _TOKEN_H_ */
data/ext/rmmseg/word.h ADDED
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
/* Size of the text array declared inside Word.  make_word allocates
 * extra room past it when a word needs more bytes. */
const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */

/* A dictionary entry.  The field order and sizes matter: make_word
 * computes the allocation size from sizeof(Word). */
struct Word
{
    unsigned char   nbytes;   /* number of bytes */
    char            length;   /* number of characters */
    unsigned short  freq;     /* frequency; make_word caps it at USHRT_MAX */
    char            text[word_embed_len]; /* nul-terminated by make_word; may extend past the struct */
};
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = std::strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = nbytes;
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
@@ -0,0 +1,59 @@
1
module RMMSeg
  # Bookkeeping for the dictionary files used by RMMSeg.
  module Dictionary
    # The two dictionaries shipped in the data directory, as
    # [type, path] pairs.
    @dictionaries = [:chars, :words].map do |type|
      [type, File.join(File.dirname(__FILE__),
                       "..", "..", "data", "#{type}.dic")]
    end

    class << self
      #
      # The list of dictionaries used by RMMSeg. Every entry has the
      # form
      #
      #   [type, path]
      #
      # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and +path+
      # is the path of the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the form
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. The two are separated by <b>exactly one space</b>.
      #
      # A <tt>:words</tt> dictionary looks similar:
      #
      #   length word
      #
      # but here the leading number is not a frequency: it is the number
      # of characters (not the number of bytes) in the word.
      #
      # The convert.rb script (in the misc directory) can be used to
      # convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary. +type+ is either
      # +:chars+ or <tt>:words</tt>; see the doc of dictionaries.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this after the list of
      # dictionaries has been set up and before any Algorithm object
      # is created. Entries with an unknown type are skipped.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'rubygems'
2
+ require 'rmmseg'
3
+ require 'ferret'
4
+
5
module RMMSeg
  module Ferret
    # An analyzer that plugs RMMSeg into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Build an Analyzer. An optional block receives the tokenizer
      # and may wrap it in further +TokenFilter+s, e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Return the token stream for +text+: the tokenizer itself, or
      # whatever the block given to +new+ returns for it.
      def token_stream(field, text)
        tokenizer = Tokenizer.new(text)
        @brk ? @brk.call(tokenizer) : tokenizer
      end
    end

    # A token stream that segments its text with RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # The text being tokenized
      attr_reader :text

      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Return the next token, or +nil+ when the text is exhausted.
      # The same Ferret token object is refilled and returned on
      # every call.
      def next
        tok = @algor.next_token
        return nil if tok.nil?

        @token.text = tok.text
        @token.start = tok.start
        @token.end = tok.end
        @token
      end

      # Replace the text to be tokenized and restart segmentation
      def text=(str)
        @text = str
        @token = ::Ferret::Analysis::Token.new("", 0, 0)
        @algor = Algorithm.new(@text)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Rmmseg
  module Cpp
    module Traditional
      # Version string of the gem.
      # NOTE(review): the package listing shows release 0.0.2 while this
      # constant still reads 0.0.1 -- confirm which one is intended.
      VERSION = "0.0.1"
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require "rmmseg-cpp-traditional/version"
2
+
3
# Namespace of the gem. Only the version file is required above; no
# other code is defined here.
module Rmmseg
  module Cpp
    module Traditional
      # Your code goes here...
    end
  end
end
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
2
+ require File.join(File.dirname(__FILE__), '..',
3
+ 'ext', 'rmmseg', 'rmmseg')
data/misc/convert.rb ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # A utility used to convert the old RMMSeg dictionary
4
+ # to rmmseg-cpp format.
5
+
6
+ # There are several constrains for the new rmmseg-cpp
7
+ # dictionary format.
8
+ # - length of word should be specified in the dict
9
+ # - number and string should be separated by ONE space
10
+ # - there should be a newline at the end of file
11
+
12
+ $KCODE='u'
13
+ require 'jcode'
14
+
15
# Print the usage text and terminate the script.
#
# msg - optional error message, printed before the usage text.
#
# Exits with status 1 when an error message is given, so that calling
# scripts can detect the failure, and with status 0 otherwise
# (previously the status was 0 in both cases).
def usage(msg=nil)
  puts "***ERROR: #{msg}\n\n" if msg
  puts <<EOT
Usage:

#{$0} action type input.dic output.dic

action: either 'convert' or 'normalize'
- 'convert' is used to convert the dict from
old RMMSeg format.
- 'normalize' is used to normalize an existing
rmmseg-cpp dict.

type: either 'words' or 'chars'

EOT
  exit(msg ? 1 : 0)
end
33
+
34
# Validate the command line: exactly four arguments, a known action
# and a known dictionary type.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
37
+
38
# Write +data+ (an array of [number, word] pairs) to the output file
# named by ARGV[3], one "number word" line per pair. Pairs whose word
# is nil are skipped.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each { |num, word| out.puts "#{num} #{word}" if word }
  end
end
45
+
46
# Read an old-format RMMSeg character dictionary from ARGV[2].
#
# Each valid line has the form "char freq". The result is an array of
# [scaled_freq, char] pairs in file order, where frequencies are
# rescaled so that the largest one becomes 65535. Lines that do not
# match yield [nil, nil].
#
# When every frequency in the file is 0 the scaled frequency is 0 as
# well (previously this raised ZeroDivisionError).
def read_RMMSeg_chars
  max = 0
  File.readlines(ARGV[2]).map do |line|
    if line =~ /^(.)\s+(\d+)$/
      n = $2.to_i
      max = n if n > max
      [n, $1]
    else
      [nil, nil]
    end
  end.map do |num, word|
    if word
      # max is only 0 when all frequencies are 0; avoid dividing by it.
      [max.zero? ? 0 : num*65535/max, word]
    else
      [nil, nil]
    end
  end
end
64
+
65
# Read an old-format RMMSeg word dictionary (one word per line) from
# ARGV[2]. Returns [character_count, word] pairs in file order; empty
# lines yield [nil, nil].
def read_RMMSeg_words
  File.readlines(ARGV[2]).map do |entry|
    entry.chomp!
    entry.empty? ? [nil, nil] : [entry.jlength, entry]
  end
end
75
+
76
# Read an rmmseg-cpp character dictionary from ARGV[2].
#
# Each valid line has the form "freq char". The result is an array of
# [scaled_freq, char] pairs in file order, where frequencies are
# rescaled so that the largest one becomes 65535. Lines that do not
# match yield [nil, nil].
#
# When every frequency in the file is 0 the scaled frequency is 0 as
# well (previously this raised ZeroDivisionError).
def read_rmmseg_cpp_chars
  max = 0
  File.readlines(ARGV[2]).map do |line|
    if line =~ /^(\d+)\s+(.)$/
      n = $1.to_i
      max = n if n > max
      [n, $2]
    else
      [nil, nil]
    end
  end.map do |num, word|
    if word
      # max is only 0 when all frequencies are 0; avoid dividing by it.
      [max.zero? ? 0 : num*65535/max, word]
    else
      [nil, nil]
    end
  end
end
94
+
95
# Read an rmmseg-cpp word dictionary from ARGV[2]. Lines of the form
# "length word" yield [length, word] (both kept as strings); every
# other line yields [nil, nil].
def read_rmmseg_cpp_words
  File.readlines(ARGV[2]).map do |entry|
    matched = /^(\d+)\s+(\w+)$/.match(entry)
    matched ? [matched[1], matched[2]] : [nil, nil]
  end
end
104
+
105
# Choose the reader matching the (action, type) pair given on the
# command line and write its result to the output file.
readers = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}
reader = readers[ARGV[0,2]]
output(send(reader)) if reader