rmmseg-cpp-traditional 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/History.txt +21 -0
  4. data/LICENSE.txt +22 -0
  5. data/Manifest.txt +43 -0
  6. data/README +111 -0
  7. data/README.md +29 -0
  8. data/Rakefile +19 -0
  9. data/bin/rmmseg +63 -0
  10. data/data/chars.dic +12638 -0
  11. data/data/words.dic +120308 -0
  12. data/ext/rmmseg/algor.cpp +222 -0
  13. data/ext/rmmseg/algor.h +80 -0
  14. data/ext/rmmseg/chunk.h +59 -0
  15. data/ext/rmmseg/dict.cpp +230 -0
  16. data/ext/rmmseg/dict.h +34 -0
  17. data/ext/rmmseg/extconf.rb +17 -0
  18. data/ext/rmmseg/memory.cpp +9 -0
  19. data/ext/rmmseg/memory.h +43 -0
  20. data/ext/rmmseg/rmmseg.cpp +263 -0
  21. data/ext/rmmseg/rules.h +86 -0
  22. data/ext/rmmseg/token.h +19 -0
  23. data/ext/rmmseg/word.h +44 -0
  24. data/lib/rmmseg/dictionary.rb +59 -0
  25. data/lib/rmmseg/ferret.rb +64 -0
  26. data/lib/rmmseg-cpp-traditional/version.rb +7 -0
  27. data/lib/rmmseg-cpp-traditional.rb +9 -0
  28. data/lib/rmmseg.rb +3 -0
  29. data/misc/convert.rb +114 -0
  30. data/misc/ferret_example.rb +59 -0
  31. data/misc/homepage.erb +196 -0
  32. data/misc/homepage.html +1212 -0
  33. data/rmmseg-cpp-traditional.gemspec +19 -0
  34. data/spec/rmmseg_spec.rb +8 -0
  35. data/spec/spec_helper.rb +17 -0
  36. data/tasks/ann.rake +81 -0
  37. data/tasks/bones.rake +21 -0
  38. data/tasks/gem.rake +126 -0
  39. data/tasks/git.rake +41 -0
  40. data/tasks/homepage.rake +15 -0
  41. data/tasks/manifest.rake +49 -0
  42. data/tasks/notes.rake +28 -0
  43. data/tasks/post_load.rake +39 -0
  44. data/tasks/rdoc.rake +51 -0
  45. data/tasks/rubyforge.rake +58 -0
  46. data/tasks/setup.rb +268 -0
  47. data/tasks/spec.rake +55 -0
  48. data/tasks/svn.rake +48 -0
  49. data/tasks/test.rake +38 -0
  50. data/test/test_rmmseg.rb +0 -0
  51. metadata +116 -0
@@ -0,0 +1,263 @@
1
+ #include <ruby.h>
2
+ #include <cstdio> // for debug
3
+
4
+ #include "token.h"
5
+ #include "dict.h"
6
+ #include "algor.h"
7
+
8
+ using namespace std;
9
+
10
+ extern "C" {
11
+
12
+ /*****************************************
13
+ *
14
+ * Normal interface
15
+ *
16
+ *****************************************/
17
+
18
+ /*********************
19
+ * RMMSeg module
20
+ *********************/
21
+ static VALUE mRMMSeg;
22
+
23
+
24
+ /*********************
25
+ * Dictionary module
26
+ *********************/
27
+ static VALUE mDictionary;
28
+
29
+ /*
30
+ * Load a character dictionary.
31
+ *
32
+ * call-seq:
33
+ * load_chars(path) -> status
34
+ *
35
+ * Return +true+ if loaded successfully, +false+ otherwise.
36
+ */
37
+ static VALUE dic_load_chars(VALUE mod, VALUE path)
38
+ {
39
+ if (rmmseg::dict::load_chars(RSTRING_PTR(path)))
40
+ return Qtrue;
41
+ return Qfalse;
42
+ }
43
+
44
+ /*
45
+ * Load a word dictionary.
46
+ *
47
+ * call-seq:
48
+ * load_words(path) -> status
49
+ *
50
+ * Return +true+ if loaded successfully, +false+ otherwise.
51
+ */
52
+ static VALUE dic_load_words(VALUE mod, VALUE path)
53
+ {
54
+ if (rmmseg::dict::load_words(RSTRING_PTR(path)))
55
+ return Qtrue;
56
+ return Qfalse;
57
+ }
58
+
59
+ /*
60
+ * Add a word to the in-memory dictionary.
61
+ *
62
+ * call-seq:
63
+ * add(word, length, freq)
64
+ *
65
+ * - +word+ is a String.
66
+ * - +length+ is number of characters (not number of bytes) of the
67
+ * word to be added.
68
+ * - +freq+ is the frequency of the word. This is only used when
69
+ * it is a one-character word.
70
+ */
71
+ static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
72
+ {
73
+ const char *str = RSTRING_PTR(word);
74
+ int nbytes = RSTRING_LEN(word);
75
+ rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
76
+ rmmseg::dict::add(w);
77
+ return Qnil;
78
+ }
79
+
80
+ /*
81
+ * Check whether one word is included in the dictionary.
82
+ *
83
+ * call-seq:
84
+ * has_word?(word) -> result
85
+ *
86
+ * Return +true+ if the word is included in the dictionary,
87
+ * +false+ otherwise.
88
+ */
89
+ static VALUE dic_has_word(VALUE mod, VALUE word)
90
+ {
91
+ const char *str = RSTRING_PTR(word);
92
+ int nbytes = RSTRING_LEN(word);
93
+ if (rmmseg::dict::get(str, nbytes) != NULL)
94
+ return Qtrue;
95
+ return Qfalse;
96
+ }
97
+
98
+
99
+ /**********************
100
+ * Token Class
101
+ **********************/
102
+ struct Token
103
+ {
104
+ VALUE text;
105
+ VALUE start;
106
+ VALUE end;
107
+ };
108
+
109
+ static void tk_mark(Token *t)
110
+ {
111
+ // start and end are Fixnums, no need to mark
112
+ rb_gc_mark(t->text);
113
+ }
114
+ static void tk_free(Token *t)
115
+ {
116
+ free(t);
117
+ }
118
+
119
+ /*
120
+ * Get the text held by this token.
121
+ *
122
+ * call-seq:
123
+ * text() -> text
124
+ *
125
+ */
126
+ static VALUE tk_text(VALUE self)
127
+ {
128
+ Token *tk = (Token *)DATA_PTR(self);
129
+ return tk->text;
130
+ }
131
+
132
+ /*
133
+ * Get the start position of this token.
134
+ *
135
+ * call-seq:
136
+ * start() -> start_pos
137
+ *
138
+ */
139
+ static VALUE tk_start(VALUE self)
140
+ {
141
+ Token *tk = (Token *)DATA_PTR(self);
142
+ return tk->start;
143
+ }
144
+
145
+ /*
146
+ * Get the end position of this token.
147
+ *
148
+ * call-seq:
149
+ * end() -> end_pos
150
+ *
151
+ */
152
+ static VALUE tk_end(VALUE self)
153
+ {
154
+ Token *tk = (Token *)DATA_PTR(self);
155
+ return tk->end;
156
+ }
157
+
158
+ static VALUE cToken;
159
+ static VALUE tk_create(const char* base, const rmmseg::Token &t)
160
+ {
161
+ Token *tk = ALLOC(Token);
162
+ int start = t.text-base;
163
+
164
+ // This is necessary, see
165
+ // http://lifegoo.pluskid.org/?p=348
166
+ volatile VALUE text = rb_str_new(t.text, t.length);
167
+ tk->text = text;
168
+
169
+ tk->start = INT2FIX(start);
170
+ tk->end = INT2FIX(start + t.length);
171
+ volatile VALUE tok = Data_Wrap_Struct(cToken,
172
+ (RUBY_DATA_FUNC)tk_mark,
173
+ (RUBY_DATA_FUNC)tk_free,
174
+ tk);
175
+ return tok;
176
+ }
177
+
178
+ /*********************
179
+ * Algorithm Class
180
+ *********************/
181
+ struct Algorithm
182
+ {
183
+ VALUE text; // hold to avoid being garbage collected
184
+ rmmseg::Algorithm *algor;
185
+ };
186
+
187
+ static void algor_mark(Algorithm *a)
188
+ {
189
+ rb_gc_mark(a->text);
190
+ }
191
+ static void algor_free(Algorithm *a)
192
+ {
193
+ free(a->algor);
194
+ }
195
+
196
+ static VALUE cAlgorithm;
197
+
198
+ /*
199
+ * Create an Algorithm object to do segmenting on +text+.
200
+ *
201
+ * call-seq:
202
+ * new(text) -> algorithm
203
+ *
204
+ */
205
+ static VALUE algor_create(VALUE klass, VALUE text)
206
+ {
207
+ Algorithm *algor = ALLOC(Algorithm);
208
+ void *mem;
209
+ algor->text = text;
210
+ mem = malloc(sizeof(rmmseg::Algorithm));
211
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING_PTR(text),
212
+ RSTRING_LEN(text));
213
+
214
+ return Data_Wrap_Struct(klass,
215
+ (RUBY_DATA_FUNC)algor_mark,
216
+ (RUBY_DATA_FUNC)algor_free,
217
+ algor);
218
+ }
219
+
220
+ /*
221
+ * Get next token.
222
+ *
223
+ * call-seq:
224
+ * next_token() -> token
225
+ *
226
+ * Return +nil+ if no more token available.
227
+ */
228
+ static VALUE algor_next_token(VALUE self)
229
+ {
230
+ Algorithm *algor = (Algorithm *)DATA_PTR(self);
231
+ rmmseg::Token tk = algor->algor->next_token();
232
+
233
+ if (tk.length == 0)
234
+ return Qnil;
235
+ volatile VALUE rtk = tk_create(RSTRING_PTR(algor->text), tk);
236
+ return rtk;
237
+ }
238
+
239
+
240
+ void Init_rmmseg()
241
+ {
242
+ mRMMSeg = rb_define_module("RMMSeg");
243
+
244
+ /* Manage dictionaries used by rmmseg. */
245
+ mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
246
+ rb_define_singleton_method(mDictionary, "load_chars", RUBY_METHOD_FUNC(dic_load_chars), 1);
247
+ rb_define_singleton_method(mDictionary, "load_words", RUBY_METHOD_FUNC(dic_load_words), 1);
248
+ rb_define_singleton_method(mDictionary, "add", RUBY_METHOD_FUNC(dic_add), 3);
249
+ rb_define_singleton_method(mDictionary, "has_word?", RUBY_METHOD_FUNC(dic_has_word), 1);
250
+
251
+ /* A Token hold the text and related position information. */
252
+ cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
253
+ rb_undef_method(rb_singleton_class(cToken), "new");
254
+ rb_define_method(cToken, "text", RUBY_METHOD_FUNC(tk_text), 0);
255
+ rb_define_method(cToken, "start", RUBY_METHOD_FUNC(tk_start), 0);
256
+ rb_define_method(cToken, "end", RUBY_METHOD_FUNC(tk_end), 0);
257
+
258
+ /* An Algorithm object use the MMSEG algorithm to do segmenting. */
259
+ cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
260
+ rb_define_singleton_method(cAlgorithm, "new", RUBY_METHOD_FUNC(algor_create), 1);
261
+ rb_define_method(cAlgorithm, "next_token", RUBY_METHOD_FUNC(algor_next_token), 0);
262
+ }
263
+ }
@@ -0,0 +1,86 @@
1
+ #ifndef _RULES_H_
2
+ #define _RULES_H_
3
+
4
+ #include <vector>
5
+ #include <algorithm>
6
+
7
+ #include "chunk.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ template <typename Cmp>
12
+ void take_highest(std::vector<Chunk> &chunks, const Cmp &cmp)
13
+ {
14
+ unsigned int i = 1, j;
15
+
16
+ for (j = 1; j < chunks.size(); ++j)
17
+ {
18
+ int rlt = cmp(chunks[j], chunks[0]);
19
+ if (rlt > 0)
20
+ i = 0;
21
+ if (rlt >= 0)
22
+ std::swap(chunks[i++], chunks[j]);
23
+ }
24
+ chunks.erase(chunks.begin()+i, chunks.end());
25
+ }
26
+
27
+ struct MMCmp_t
28
+ {
29
+ int operator()(const Chunk &a, const Chunk &b) const
30
+ {
31
+ return a.total_length() - b.total_length();
32
+ }
33
+ } MMCmp;
34
+ void mm_filter(std::vector<Chunk> &chunks)
35
+ {
36
+ take_highest(chunks, MMCmp);
37
+ }
38
+
39
+ struct LAWLCmp_t
40
+ {
41
+ int operator()(const Chunk &a, const Chunk &b) const
42
+ {
43
+ double rlt = a.average_length() - b.average_length();
44
+ if (rlt == 0)
45
+ return 0;
46
+ if (rlt > 0)
47
+ return 1;
48
+ return -1;
49
+ }
50
+ } LAWLCmp;
51
+ void lawl_filter(std::vector<Chunk> &chunks)
52
+ {
53
+ take_highest(chunks, LAWLCmp);
54
+ }
55
+
56
+ struct SVWLCmp_t
57
+ {
58
+ int operator()(const Chunk &a, const Chunk& b) const
59
+ {
60
+ double rlt = a.variance() - b.variance();
61
+ if (rlt == 0)
62
+ return 0;
63
+ if (rlt < 0)
64
+ return 1;
65
+ return -1;
66
+ }
67
+ } SVWLCmp;
68
+ void svwl_filter(std::vector<Chunk> &chunks)
69
+ {
70
+ take_highest(chunks, SVWLCmp);
71
+ }
72
+
73
+ struct LSDMFOCWCmp_t
74
+ {
75
+ int operator()(const Chunk &a, const Chunk& b) const
76
+ {
77
+ return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
78
+ }
79
+ } LSDMFOCWCmp;
80
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
81
+ {
82
+ take_highest(chunks, LSDMFOCWCmp);
83
+ }
84
+ }
85
+
86
+ #endif /* _RULES_H_ */
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
namespace rmmseg
{
    /**
     * A non-owning view of one segmented token: a pointer into the text
     * being segmented plus a byte count.
     */
    struct Token
    {
        // `text' may or may not be nul-terminated; always rely on
        // `length' for its extent.
        const char *text;

        // A length of 0 denotes an empty token.
        int length;

        Token(const char *txt, int len)
        {
            text = txt;
            length = len;
        }
    };
}
18
+
19
+ #endif /* _TOKEN_H_ */
data/ext/rmmseg/word.h ADDED
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
12
+ struct Word
13
+ {
14
+ unsigned char nbytes; /* number of bytes */
15
+ char length; /* number of characters */
16
+ unsigned short freq;
17
+ char text[word_embed_len];
18
+ };
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = std::strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = nbytes;
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
@@ -0,0 +1,59 @@
1
module RMMSeg
  module Dictionary
    # Builds the path of a dictionary file bundled in the data directory.
    bundled = lambda do |name|
      File.join(File.dirname(__FILE__), "..", "..", "data", name)
    end

    @dictionaries = [
      [:chars, bundled.call("chars.dic")],
      [:words, bundled.call("words.dic")]
    ]

    class << self
      #
      # The list of dictionaries used by RMMSeg. Every entry has the form
      #
      #   [type, path]
      #
      # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and
      # +path+ is the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the form
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. They are separated by <b>exactly one space</b>.
      #
      # A <tt>:words</tt> dictionary looks similar:
      #
      #   length word
      #
      # except that the leading number is not a frequency but the number
      # of characters (not the number of bytes) in the word.
      #
      # The convert.rb script in the misc directory can be used to convert
      # and normalize dictionaries.
      attr_reader :dictionaries
      attr_writer :dictionaries

      # Register a user defined dictionary. +type+ is either
      # <tt>:chars</tt> or <tt>:words</tt>; see the documentation of
      # dictionaries for the file formats.
      def add_dictionary(path, type)
        @dictionaries << [type, path]
      end

      # Load every registered dictionary. Call this once the dictionary
      # paths are set up and before creating any Algorithm object.
      # Entries whose type is neither <tt>:chars</tt> nor
      # <tt>:words</tt> are ignored.
      def load_dictionaries()
        @dictionaries.each do |kind, file|
          if kind == :chars then load_chars(file)
          elsif kind == :words then load_words(file)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'rubygems'
2
+ require 'rmmseg'
3
+ require 'ferret'
4
+
5
module RMMSeg
  module Ferret
    # An Analyzer that plugs RMMSeg segmentation into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Build an Analyzer. An optional block receives the tokenizer and
      # may wrap it in further +TokenFilter+s, e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Return the token stream for +text+: the block's result when a
      # block was given to the constructor, the bare Tokenizer otherwise.
      def token_stream(field, text)
        tokenizer = Tokenizer.new(text)
        @brk ? @brk.call(tokenizer) : tokenizer
      end
    end

    # The Tokenizer splits text into tokens using RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Return the next token, or +nil+ when the text is exhausted.
      # The same Ferret token object is reused for every call.
      def next
        tok = @algor.next_token
        return nil if tok.nil?

        @token.text = tok.text
        @token.start = tok.start
        @token.end = tok.end
        @token
      end

      # The text being tokenized
      def text
        @text
      end

      # Replace the text to be tokenized; this resets the reusable token
      # and creates a fresh Algorithm for the new text.
      def text=(str)
        @token = ::Ferret::Analysis::Token.new("", 0, 0)
        @text = str
        @algor = Algorithm.new(@text)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Rmmseg
  module Cpp
    module Traditional
      # Version string of the rmmseg-cpp-traditional gem.
      # NOTE(review): the package listing reports version 0.0.2 while this
      # constant says 0.0.1 -- confirm which one is authoritative.
      VERSION = "0.0.1"
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require "rmmseg-cpp-traditional/version"
2
+
3
module Rmmseg
  module Cpp
    module Traditional
      # Namespace placeholder for the gem; it defines nothing beyond the
      # VERSION constant loaded from rmmseg-cpp-traditional/version.
    end
  end
end
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
2
+ require File.join(File.dirname(__FILE__), '..',
3
+ 'ext', 'rmmseg', 'rmmseg')
data/misc/convert.rb ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # A utility used to convert the old RMMSeg dictionary
4
+ # to rmmseg-cpp format.
5
+
6
+ # There are several constraints for the new rmmseg-cpp
7
+ # dictionary format.
8
+ # - length of word should be specified in the dict
9
+ # - number and string should be separated by ONE space
10
+ # - there should be a newline at the end of file
11
+
12
+ $KCODE='u'
13
+ require 'jcode'
14
+
15
# Print an optional error message followed by the usage text, then
# terminate the script.
#
# msg:: error description shown before the usage text, or +nil+.
#
# Every call site is an error path (wrong argument count or an unknown
# action/type), so the script exits with a non-zero status to let
# callers detect the failure.
def usage(msg=nil)
  puts "***ERROR: #{msg}\n\n" if msg
  puts <<EOT
Usage:

#{$0} action type input.dic output.dic

action: either 'convert' or 'normalize'
- 'convert' is used to convert the dict from
old RMMSeg format.
- 'normalize' is used to normalize an existing
rmmseg-cpp dict.

type: either 'words' or 'chars'

EOT
  exit(1)
end
33
+
34
# Validate the command line: exactly four arguments, a known action and
# a known dictionary type.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
37
+
38
# Write +data+ (an array of [number, word] pairs) to the output file
# named by ARGV[3], one "number word" line per pair. Pairs whose word is
# nil (lines that failed to parse) are skipped.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each do |num, word|
      next unless word
      out.puts "#{num} #{word}"
    end
  end
end
45
+
46
# Read an old-format RMMSeg character dictionary from ARGV[2].
#
# Each valid line is "char freq". Returns one [scaled_freq, char] pair
# per input line, with frequencies rescaled so that the largest becomes
# 65535; lines that do not match yield [nil, nil].
def read_RMMSeg_chars
  max = 0
  parsed = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(.)\s+(\d+)$/
      n = $2.to_i
      max = n if n > max
      [n, $1]
    else
      [nil, nil]
    end
  end

  # The rescaling needs the final maximum, hence the second pass.
  parsed.map do |num, word|
    if word
      # When every frequency is 0 there is nothing to scale; dividing by
      # max would raise ZeroDivisionError.
      [(max.zero? ? 0 : num*65535/max), word]
    else
      [nil, nil]
    end
  end
end
64
+
65
# Read an old-format RMMSeg word dictionary (one word per line) from
# ARGV[2]. Returns one [character_count, word] pair per line; empty lines
# yield [nil, nil].
def read_RMMSeg_words
  File.readlines(ARGV[2]).map do |line|
    line.chomp!
    line.empty? ? [nil, nil] : [line.jlength, line]
  end
end
75
+
76
# Read an rmmseg-cpp character dictionary from ARGV[2].
#
# Each valid line is "freq char". Returns one [scaled_freq, char] pair
# per input line, with frequencies rescaled so that the largest becomes
# 65535; lines that do not match yield [nil, nil].
def read_rmmseg_cpp_chars
  max = 0
  parsed = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(\d+)\s+(.)$/
      n = $1.to_i
      max = n if n > max
      [n, $2]
    else
      [nil, nil]
    end
  end

  # The rescaling needs the final maximum, hence the second pass.
  parsed.map do |num, word|
    if word
      # When every frequency is 0 there is nothing to scale; dividing by
      # max would raise ZeroDivisionError.
      [(max.zero? ? 0 : num*65535/max), word]
    else
      [nil, nil]
    end
  end
end
94
+
95
# Read an rmmseg-cpp word dictionary from ARGV[2].
#
# Each valid line is "length word". Returns one [length, word] pair per
# line, both as strings exactly as read; lines that do not match yield
# [nil, nil].
def read_rmmseg_cpp_words
  File.readlines(ARGV[2]).map do |line|
    m = /^(\d+)\s+(\w+)$/.match(line)
    m ? [m[1], m[2]] : [nil, nil]
  end
end
104
+
105
# Pick the reader matching the requested action and dictionary type and
# write its result to the output file.
readers = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}
reader = readers[ARGV[0,2]]
output(send(reader)) if reader