plexus-rmmseg 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/History.txt +42 -0
- data/Manifest.txt +51 -0
- data/README.txt +74 -0
- data/Rakefile +12 -0
- data/TODO.txt +5 -0
- data/bin/rmmseg +65 -0
- data/data/chars.dic +12638 -0
- data/data/custom.dic +12 -0
- data/data/punctuation.dic +79 -0
- data/data/words.dic +120330 -0
- data/lib/rmmseg.rb +13 -0
- data/lib/rmmseg/algorithm.rb +136 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chunk.rb +41 -0
- data/lib/rmmseg/complex_algorithm.rb +122 -0
- data/lib/rmmseg/config.rb +65 -0
- data/lib/rmmseg/dictionary.rb +80 -0
- data/lib/rmmseg/ferret.rb +109 -0
- data/lib/rmmseg/lawl_rule.rb +12 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
- data/lib/rmmseg/mm_rule.rb +13 -0
- data/lib/rmmseg/rule_helper.rb +28 -0
- data/lib/rmmseg/simple_algorithm.rb +37 -0
- data/lib/rmmseg/svwl_rule.rb +12 -0
- data/lib/rmmseg/token.rb +30 -0
- data/lib/rmmseg/version.rb +3 -0
- data/lib/rmmseg/word.rb +38 -0
- data/misc/ferret_example.rb +56 -0
- data/misc/homepage.erb +170 -0
- data/misc/homepage.html +1214 -0
- data/plexus-rmmseg.gemspec +20 -0
- data/spec/chunk_spec.rb +25 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- data/tasks/ann.rake +76 -0
- data/tasks/annotations.rake +22 -0
- data/tasks/doc.rake +48 -0
- data/tasks/gem.rake +110 -0
- data/tasks/homepage.rake +12 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/post_load.rake +26 -0
- data/tasks/rubyforge.rake +57 -0
- data/tasks/setup.rb +227 -0
- data/tasks/spec.rake +54 -0
- data/tasks/svn.rake +44 -0
- data/tasks/test.rake +38 -0
- metadata +121 -0
data/misc/homepage.html
ADDED
@@ -0,0 +1,1214 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
|
5
|
+
<meta name="date" content="06 February 2008"/>
|
6
|
+
<meta name="author" content="pluskid"/>
|
7
|
+
<meta name="generator" content="Gerbil 2.0.0"/>
|
8
|
+
<title>RMMSeg Homepage</title>
|
9
|
+
|
10
|
+
<style type="text/css" media="screen">
|
11
|
+
body
|
12
|
+
{
|
13
|
+
color : #000000;
|
14
|
+
background-color : #FFFFFF;
|
15
|
+
line-height : 1.5em;
|
16
|
+
font-family : Calibri, Verdana, sans-serif;
|
17
|
+
}
|
18
|
+
|
19
|
+
/* emphasis */
|
20
|
+
|
21
|
+
em,
|
22
|
+
blockquote
|
23
|
+
{
|
24
|
+
font-family : Cambria, Georgia, serif;
|
25
|
+
}
|
26
|
+
|
27
|
+
/* headings */
|
28
|
+
|
29
|
+
h1,
|
30
|
+
h2,
|
31
|
+
h3,
|
32
|
+
h4,
|
33
|
+
h5,
|
34
|
+
h6,
|
35
|
+
.title
|
36
|
+
{
|
37
|
+
font-weight : lighter;
|
38
|
+
font-family : Constantia, "Book Antiqua", "URW Bookman L", serif;
|
39
|
+
}
|
40
|
+
|
41
|
+
#lof h1,
|
42
|
+
#lof h2,
|
43
|
+
#lof h3,
|
44
|
+
#lof h4,
|
45
|
+
#lof h5,
|
46
|
+
#lof h6
|
47
|
+
{
|
48
|
+
margin-top : 1.25em;
|
49
|
+
}
|
50
|
+
|
51
|
+
#content h1,
|
52
|
+
#content h2,
|
53
|
+
#content h3,
|
54
|
+
#content h4,
|
55
|
+
#content h5,
|
56
|
+
#content h6
|
57
|
+
{
|
58
|
+
margin-top : 2.5em;
|
59
|
+
line-height : 1.25em;
|
60
|
+
}
|
61
|
+
|
62
|
+
#content h1
|
63
|
+
{
|
64
|
+
font-size : 2.0em;
|
65
|
+
}
|
66
|
+
|
67
|
+
#content h2
|
68
|
+
{
|
69
|
+
font-size : 1.8em;
|
70
|
+
}
|
71
|
+
|
72
|
+
#content h3
|
73
|
+
{
|
74
|
+
font-size : 1.6em;
|
75
|
+
}
|
76
|
+
|
77
|
+
#content h4
|
78
|
+
{
|
79
|
+
font-size : 1.4em;
|
80
|
+
}
|
81
|
+
|
82
|
+
#content h5
|
83
|
+
{
|
84
|
+
font-size : 1.2em;
|
85
|
+
}
|
86
|
+
|
87
|
+
#content h6
|
88
|
+
{
|
89
|
+
font-size : 1.0em;
|
90
|
+
}
|
91
|
+
|
92
|
+
/* tables */
|
93
|
+
|
94
|
+
table
|
95
|
+
{
|
96
|
+
border : none;
|
97
|
+
margin : auto; /* center horizontally */
|
98
|
+
margin-top : 1em;
|
99
|
+
}
|
100
|
+
|
101
|
+
th,
|
102
|
+
td
|
103
|
+
{
|
104
|
+
padding : 1em;
|
105
|
+
border : 1px solid #C0C0C0;
|
106
|
+
vertical-align : top;
|
107
|
+
background-color : #FFFFFF;
|
108
|
+
}
|
109
|
+
|
110
|
+
th
|
111
|
+
{
|
112
|
+
background-color : #F5F5F5;
|
113
|
+
}
|
114
|
+
|
115
|
+
/* document structure */
|
116
|
+
|
117
|
+
#header
|
118
|
+
{
|
119
|
+
margin-bottom : 5em;
|
120
|
+
text-align : center;
|
121
|
+
}
|
122
|
+
|
123
|
+
#abstract
|
124
|
+
{
|
125
|
+
margin-bottom : 5em;
|
126
|
+
}
|
127
|
+
|
128
|
+
#toc li
|
129
|
+
{
|
130
|
+
list-style-type : none;
|
131
|
+
}
|
132
|
+
|
133
|
+
#toc li ul
|
134
|
+
{
|
135
|
+
padding-bottom : 1em;
|
136
|
+
border-left : thick solid #F5F5F5;
|
137
|
+
_border-left : none; /* for IE6 */
|
138
|
+
}
|
139
|
+
|
140
|
+
#toc li ul:hover
|
141
|
+
{
|
142
|
+
border-color : #DCDCDC;
|
143
|
+
}
|
144
|
+
|
145
|
+
#toc > ul
|
146
|
+
{
|
147
|
+
padding-left : 1em;
|
148
|
+
}
|
149
|
+
|
150
|
+
#references
|
151
|
+
{
|
152
|
+
margin-top : 5em;
|
153
|
+
}
|
154
|
+
|
155
|
+
#footer
|
156
|
+
{
|
157
|
+
border-top : thick dotted #DCDCDC;
|
158
|
+
padding-top : 1em;
|
159
|
+
margin-top : 5em;
|
160
|
+
text-align : center;
|
161
|
+
}
|
162
|
+
|
163
|
+
/* document nodes */
|
164
|
+
|
165
|
+
.part > .title,
|
166
|
+
.chapter > .title
|
167
|
+
{
|
168
|
+
padding-bottom : 0.5em;
|
169
|
+
}
|
170
|
+
|
171
|
+
.part > .title > big,
|
172
|
+
.chapter > .title > big
|
173
|
+
{
|
174
|
+
display : block;
|
175
|
+
margin-top : 0.25em;
|
176
|
+
}
|
177
|
+
|
178
|
+
.part .title big,
|
179
|
+
.chapter .title big
|
180
|
+
{
|
181
|
+
_display : block; /* for IE6 */
|
182
|
+
_margin-top : 0.25em; /* for IE6 */
|
183
|
+
_margin-bottom : 0.75em; /* for IE6 */
|
184
|
+
}
|
185
|
+
|
186
|
+
.paragraph > .title,
|
187
|
+
.tip > .title,
|
188
|
+
.note > .title,
|
189
|
+
.caution > .title,
|
190
|
+
.warning > .title,
|
191
|
+
.important > .title,
|
192
|
+
.figure > .title,
|
193
|
+
.table > .title,
|
194
|
+
.example > .title,
|
195
|
+
.equation > .title,
|
196
|
+
.procedure > .title
|
197
|
+
{
|
198
|
+
font-size : large;
|
199
|
+
margin-top : 2em;
|
200
|
+
}
|
201
|
+
|
202
|
+
.paragraph .title,
|
203
|
+
.tip .title,
|
204
|
+
.note .title,
|
205
|
+
.caution .title,
|
206
|
+
.warning .title,
|
207
|
+
.important .title,
|
208
|
+
.figure .title,
|
209
|
+
.table .title,
|
210
|
+
.example .title,
|
211
|
+
.equation .title,
|
212
|
+
.procedure .title
|
213
|
+
{
|
214
|
+
_font-size : large; /* for IE6 */
|
215
|
+
_font-weight : bold; /* large is not bold in IE6 */
|
216
|
+
_margin-top : 2em; /* for IE6 */
|
217
|
+
}
|
218
|
+
|
219
|
+
.tip ,
|
220
|
+
.note ,
|
221
|
+
.caution ,
|
222
|
+
.warning ,
|
223
|
+
.important,
|
224
|
+
.figure ,
|
225
|
+
.table ,
|
226
|
+
.example ,
|
227
|
+
.equation ,
|
228
|
+
.procedure
|
229
|
+
{
|
230
|
+
margin : 3em;
|
231
|
+
}
|
232
|
+
|
233
|
+
.tip > .icon,
|
234
|
+
.note > .icon,
|
235
|
+
.caution > .icon,
|
236
|
+
.warning > .icon,
|
237
|
+
.important > .icon
|
238
|
+
{
|
239
|
+
float : left;
|
240
|
+
margin : 0 1em 1em 0; /* top right bottom left */
|
241
|
+
}
|
242
|
+
|
243
|
+
.tip .icon,
|
244
|
+
.note .icon,
|
245
|
+
.caution .icon,
|
246
|
+
.warning .icon,
|
247
|
+
.important .icon
|
248
|
+
{
|
249
|
+
_display : none; /* IE6 cannot display embedded images */
|
250
|
+
}
|
251
|
+
|
252
|
+
.figure > .title
|
253
|
+
{
|
254
|
+
text-align : center;
|
255
|
+
}
|
256
|
+
|
257
|
+
.figure .title
|
258
|
+
{
|
259
|
+
_text-align : center; /* for IE6 */
|
260
|
+
}
|
261
|
+
|
262
|
+
.figure > .content img
|
263
|
+
{
|
264
|
+
display : block;
|
265
|
+
margin : auto;
|
266
|
+
}
|
267
|
+
|
268
|
+
.figure .content img
|
269
|
+
{
|
270
|
+
_display : block; /* for IE6 */
|
271
|
+
_margin : auto; /* for IE6 */
|
272
|
+
}
|
273
|
+
|
274
|
+
body
|
275
|
+
{
|
276
|
+
margin : auto;
|
277
|
+
padding : 0.5em;
|
278
|
+
max-width : 36em;
|
279
|
+
}
|
280
|
+
|
281
|
+
/* hyperlinks */
|
282
|
+
|
283
|
+
a
|
284
|
+
{
|
285
|
+
color : #0000FF;
|
286
|
+
text-decoration : none;
|
287
|
+
}
|
288
|
+
|
289
|
+
a:visited
|
290
|
+
{
|
291
|
+
color : #800080;
|
292
|
+
}
|
293
|
+
|
294
|
+
a:hover
|
295
|
+
{
|
296
|
+
color : #FF0000;
|
297
|
+
text-decoration : underline;
|
298
|
+
}
|
299
|
+
|
300
|
+
a:target
|
301
|
+
{
|
302
|
+
color : #FF0000;
|
303
|
+
text-decoration : underline;
|
304
|
+
}
|
305
|
+
|
306
|
+
a.toc:link,
|
307
|
+
a.toc:visited
|
308
|
+
{
|
309
|
+
text-decoration : none;
|
310
|
+
z-index : 1;
|
311
|
+
}
|
312
|
+
|
313
|
+
a img
|
314
|
+
{
|
315
|
+
border : none;
|
316
|
+
}
|
317
|
+
|
318
|
+
/*
|
319
|
+
mark external links with a symbol to help the user
|
320
|
+
distinguish between internal and external links
|
321
|
+
*/
|
322
|
+
a:after
|
323
|
+
{
|
324
|
+
content: "∗";
|
325
|
+
}
|
326
|
+
|
327
|
+
a[href^="#"]:after
|
328
|
+
{
|
329
|
+
content: "";
|
330
|
+
}
|
331
|
+
|
332
|
+
/* source code */
|
333
|
+
|
334
|
+
tt,
|
335
|
+
code,
|
336
|
+
pre
|
337
|
+
{
|
338
|
+
font-family : Consolas, "Lucida Console", monospace;
|
339
|
+
}
|
340
|
+
|
341
|
+
tt
|
342
|
+
{
|
343
|
+
font-weight : bold;
|
344
|
+
color : #A52A2A;
|
345
|
+
background-color : #FFFAF0;
|
346
|
+
}
|
347
|
+
|
348
|
+
/* output of syntax colorizer */
|
349
|
+
.code
|
350
|
+
{
|
351
|
+
background-color : #FFFFF0;
|
352
|
+
}
|
353
|
+
|
354
|
+
pre
|
355
|
+
{
|
356
|
+
line-height : normal;
|
357
|
+
border : 1px dashed #C0C0C0;
|
358
|
+
background-color : #F5FFDF;
|
359
|
+
padding : 1em;
|
360
|
+
overflow : auto;
|
361
|
+
cursor : text;
|
362
|
+
}
|
363
|
+
|
364
|
+
/*
|
365
|
+
pre:hover
|
366
|
+
{
|
367
|
+
border : none;
|
368
|
+
position : fixed;
|
369
|
+
z-index : 1;
|
370
|
+
margin : 0;
|
371
|
+
top : 0;
|
372
|
+
left : 0;
|
373
|
+
right : 0;
|
374
|
+
bottom : 0;
|
375
|
+
overflow : auto;
|
376
|
+
cursor : text;
|
377
|
+
}
|
378
|
+
*/
|
379
|
+
|
380
|
+
/* emphasis */
|
381
|
+
|
382
|
+
blockquote
|
383
|
+
{
|
384
|
+
margin : 1em;
|
385
|
+
border : 5px dotted #C0C0C0;
|
386
|
+
padding : 1em;
|
387
|
+
color : #444;
|
388
|
+
}
|
389
|
+
|
390
|
+
hr
|
391
|
+
{
|
392
|
+
color : #FF0000; /* for IE6 */
|
393
|
+
background-color : #FF0000; /* for Firefox */
|
394
|
+
}
|
395
|
+
|
396
|
+
</style>
|
397
|
+
<style type="text/css" media="print">
|
398
|
+
body
|
399
|
+
{
|
400
|
+
color : #000000;
|
401
|
+
background-color : #FFFFFF;
|
402
|
+
line-height : 1.5em;
|
403
|
+
font-family : Calibri, Verdana, sans-serif;
|
404
|
+
}
|
405
|
+
|
406
|
+
/* emphasis */
|
407
|
+
|
408
|
+
em,
|
409
|
+
blockquote
|
410
|
+
{
|
411
|
+
font-family : Cambria, Georgia, serif;
|
412
|
+
}
|
413
|
+
|
414
|
+
/* headings */
|
415
|
+
|
416
|
+
h1,
|
417
|
+
h2,
|
418
|
+
h3,
|
419
|
+
h4,
|
420
|
+
h5,
|
421
|
+
h6,
|
422
|
+
.title
|
423
|
+
{
|
424
|
+
font-weight : lighter;
|
425
|
+
font-family : Constantia, "Book Antiqua", "URW Bookman L", serif;
|
426
|
+
}
|
427
|
+
|
428
|
+
#lof h1,
|
429
|
+
#lof h2,
|
430
|
+
#lof h3,
|
431
|
+
#lof h4,
|
432
|
+
#lof h5,
|
433
|
+
#lof h6
|
434
|
+
{
|
435
|
+
margin-top : 1.25em;
|
436
|
+
}
|
437
|
+
|
438
|
+
#content h1,
|
439
|
+
#content h2,
|
440
|
+
#content h3,
|
441
|
+
#content h4,
|
442
|
+
#content h5,
|
443
|
+
#content h6
|
444
|
+
{
|
445
|
+
margin-top : 2.5em;
|
446
|
+
line-height : 1.25em;
|
447
|
+
}
|
448
|
+
|
449
|
+
#content h1
|
450
|
+
{
|
451
|
+
font-size : 2.0em;
|
452
|
+
}
|
453
|
+
|
454
|
+
#content h2
|
455
|
+
{
|
456
|
+
font-size : 1.8em;
|
457
|
+
}
|
458
|
+
|
459
|
+
#content h3
|
460
|
+
{
|
461
|
+
font-size : 1.6em;
|
462
|
+
}
|
463
|
+
|
464
|
+
#content h4
|
465
|
+
{
|
466
|
+
font-size : 1.4em;
|
467
|
+
}
|
468
|
+
|
469
|
+
#content h5
|
470
|
+
{
|
471
|
+
font-size : 1.2em;
|
472
|
+
}
|
473
|
+
|
474
|
+
#content h6
|
475
|
+
{
|
476
|
+
font-size : 1.0em;
|
477
|
+
}
|
478
|
+
|
479
|
+
/* tables */
|
480
|
+
|
481
|
+
table
|
482
|
+
{
|
483
|
+
border : none;
|
484
|
+
margin : auto; /* center horizontally */
|
485
|
+
margin-top : 1em;
|
486
|
+
}
|
487
|
+
|
488
|
+
th,
|
489
|
+
td
|
490
|
+
{
|
491
|
+
padding : 1em;
|
492
|
+
border : 1px solid #C0C0C0;
|
493
|
+
vertical-align : top;
|
494
|
+
background-color : #FFFFFF;
|
495
|
+
}
|
496
|
+
|
497
|
+
th
|
498
|
+
{
|
499
|
+
background-color : #F5F5F5;
|
500
|
+
}
|
501
|
+
|
502
|
+
/* document structure */
|
503
|
+
|
504
|
+
#header
|
505
|
+
{
|
506
|
+
margin-bottom : 5em;
|
507
|
+
text-align : center;
|
508
|
+
}
|
509
|
+
|
510
|
+
#abstract
|
511
|
+
{
|
512
|
+
margin-bottom : 5em;
|
513
|
+
}
|
514
|
+
|
515
|
+
#toc li
|
516
|
+
{
|
517
|
+
list-style-type : none;
|
518
|
+
}
|
519
|
+
|
520
|
+
#toc li ul
|
521
|
+
{
|
522
|
+
padding-bottom : 1em;
|
523
|
+
border-left : thick solid #F5F5F5;
|
524
|
+
_border-left : none; /* for IE6 */
|
525
|
+
}
|
526
|
+
|
527
|
+
#toc li ul:hover
|
528
|
+
{
|
529
|
+
border-color : #DCDCDC;
|
530
|
+
}
|
531
|
+
|
532
|
+
#toc > ul
|
533
|
+
{
|
534
|
+
padding-left : 1em;
|
535
|
+
}
|
536
|
+
|
537
|
+
#references
|
538
|
+
{
|
539
|
+
margin-top : 5em;
|
540
|
+
}
|
541
|
+
|
542
|
+
#footer
|
543
|
+
{
|
544
|
+
border-top : thick dotted #DCDCDC;
|
545
|
+
padding-top : 1em;
|
546
|
+
margin-top : 5em;
|
547
|
+
text-align : center;
|
548
|
+
}
|
549
|
+
|
550
|
+
/* document nodes */
|
551
|
+
|
552
|
+
.part > .title,
|
553
|
+
.chapter > .title
|
554
|
+
{
|
555
|
+
padding-bottom : 0.5em;
|
556
|
+
}
|
557
|
+
|
558
|
+
.part > .title > big,
|
559
|
+
.chapter > .title > big
|
560
|
+
{
|
561
|
+
display : block;
|
562
|
+
margin-top : 0.25em;
|
563
|
+
}
|
564
|
+
|
565
|
+
.part .title big,
|
566
|
+
.chapter .title big
|
567
|
+
{
|
568
|
+
_display : block; /* for IE6 */
|
569
|
+
_margin-top : 0.25em; /* for IE6 */
|
570
|
+
_margin-bottom : 0.75em; /* for IE6 */
|
571
|
+
}
|
572
|
+
|
573
|
+
.paragraph > .title,
|
574
|
+
.tip > .title,
|
575
|
+
.note > .title,
|
576
|
+
.caution > .title,
|
577
|
+
.warning > .title,
|
578
|
+
.important > .title,
|
579
|
+
.figure > .title,
|
580
|
+
.table > .title,
|
581
|
+
.example > .title,
|
582
|
+
.equation > .title,
|
583
|
+
.procedure > .title
|
584
|
+
{
|
585
|
+
font-size : large;
|
586
|
+
margin-top : 2em;
|
587
|
+
}
|
588
|
+
|
589
|
+
.paragraph .title,
|
590
|
+
.tip .title,
|
591
|
+
.note .title,
|
592
|
+
.caution .title,
|
593
|
+
.warning .title,
|
594
|
+
.important .title,
|
595
|
+
.figure .title,
|
596
|
+
.table .title,
|
597
|
+
.example .title,
|
598
|
+
.equation .title,
|
599
|
+
.procedure .title
|
600
|
+
{
|
601
|
+
_font-size : large; /* for IE6 */
|
602
|
+
_font-weight : bold; /* large is not bold in IE6 */
|
603
|
+
_margin-top : 2em; /* for IE6 */
|
604
|
+
}
|
605
|
+
|
606
|
+
.tip ,
|
607
|
+
.note ,
|
608
|
+
.caution ,
|
609
|
+
.warning ,
|
610
|
+
.important,
|
611
|
+
.figure ,
|
612
|
+
.table ,
|
613
|
+
.example ,
|
614
|
+
.equation ,
|
615
|
+
.procedure
|
616
|
+
{
|
617
|
+
margin : 3em;
|
618
|
+
}
|
619
|
+
|
620
|
+
.tip > .icon,
|
621
|
+
.note > .icon,
|
622
|
+
.caution > .icon,
|
623
|
+
.warning > .icon,
|
624
|
+
.important > .icon
|
625
|
+
{
|
626
|
+
float : left;
|
627
|
+
margin : 0 1em 1em 0; /* top right bottom left */
|
628
|
+
}
|
629
|
+
|
630
|
+
.tip .icon,
|
631
|
+
.note .icon,
|
632
|
+
.caution .icon,
|
633
|
+
.warning .icon,
|
634
|
+
.important .icon
|
635
|
+
{
|
636
|
+
_display : none; /* IE6 cannot display embedded images */
|
637
|
+
}
|
638
|
+
|
639
|
+
.figure > .title
|
640
|
+
{
|
641
|
+
text-align : center;
|
642
|
+
}
|
643
|
+
|
644
|
+
.figure .title
|
645
|
+
{
|
646
|
+
_text-align : center; /* for IE6 */
|
647
|
+
}
|
648
|
+
|
649
|
+
.figure > .content img
|
650
|
+
{
|
651
|
+
display : block;
|
652
|
+
margin : auto;
|
653
|
+
}
|
654
|
+
|
655
|
+
.figure .content img
|
656
|
+
{
|
657
|
+
_display : block; /* for IE6 */
|
658
|
+
_margin : auto; /* for IE6 */
|
659
|
+
}
|
660
|
+
|
661
|
+
/* headings */
|
662
|
+
|
663
|
+
h1,
|
664
|
+
h2,
|
665
|
+
h3,
|
666
|
+
h4,
|
667
|
+
h5,
|
668
|
+
h6
|
669
|
+
{
|
670
|
+
font-weight : normal;
|
671
|
+
}
|
672
|
+
|
673
|
+
/* hyperlinks */
|
674
|
+
|
675
|
+
a:link,
|
676
|
+
a:visited,
|
677
|
+
a:active
|
678
|
+
{
|
679
|
+
color : #0000FF;
|
680
|
+
font-weight : bold;
|
681
|
+
text-decoration : underline;
|
682
|
+
}
|
683
|
+
|
684
|
+
a:after
|
685
|
+
{
|
686
|
+
content : " (" attr(href) ")";
|
687
|
+
font-family : sans-serif;
|
688
|
+
font-weight : normal;
|
689
|
+
font-size : 90%;
|
690
|
+
}
|
691
|
+
|
692
|
+
a[href^="#"]:after
|
693
|
+
{
|
694
|
+
content : "";
|
695
|
+
}
|
696
|
+
|
697
|
+
a[href^="#"]
|
698
|
+
{
|
699
|
+
color : #A52A2A;
|
700
|
+
font-weight : lighter;
|
701
|
+
text-decoration : none;
|
702
|
+
font-style : italic;
|
703
|
+
}
|
704
|
+
|
705
|
+
a.toc:link,
|
706
|
+
a.toc:visited
|
707
|
+
{
|
708
|
+
color : inherit;
|
709
|
+
.color : #000000; /* for IE6 and IE7 */
|
710
|
+
font-weight : inherit;
|
711
|
+
text-decoration : none;
|
712
|
+
font-style : normal;
|
713
|
+
}
|
714
|
+
|
715
|
+
/* source code */
|
716
|
+
|
717
|
+
tt
|
718
|
+
{
|
719
|
+
color : inherit;
|
720
|
+
background-color : inherit;
|
721
|
+
font-weight : normal;
|
722
|
+
}
|
723
|
+
|
724
|
+
pre,
|
725
|
+
.code
|
726
|
+
{
|
727
|
+
border : none;
|
728
|
+
overflow : visible;
|
729
|
+
background-color : inherit;
|
730
|
+
}
|
731
|
+
|
732
|
+
/* document structure */
|
733
|
+
|
734
|
+
#lof
|
735
|
+
{
|
736
|
+
display : none;
|
737
|
+
}
|
738
|
+
|
739
|
+
/* document nodes */
|
740
|
+
|
741
|
+
.part > .title > big,
|
742
|
+
.chapter > .title > big
|
743
|
+
{
|
744
|
+
padding-bottom : 0.5em;
|
745
|
+
}
|
746
|
+
|
747
|
+
.part .title big,
|
748
|
+
.chapter .title big
|
749
|
+
{
|
750
|
+
_padding-bottom : 0.5em; /* for IE6 */
|
751
|
+
}
|
752
|
+
|
753
|
+
</style>
|
754
|
+
</head>
|
755
|
+
<body>
|
756
|
+
|
757
|
+
<div id="header">
|
758
|
+
|
759
|
+
|
760
|
+
<h1 class="title">RMMSeg Homepage</h1>
|
761
|
+
<h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
|
762
|
+
<h3 class="date">06 February 2008</h3>
|
763
|
+
|
764
|
+
</div>
|
765
|
+
|
766
|
+
|
767
|
+
|
768
|
+
<div id="toc"><h1>Contents</h1> <ul><li>1 <a id="a-606666518" href="#Introduction">Introduction</a></li><li>2 <a id="a-606668658" href="#Setup">Setup</a><ul><li>2.1 <a id="a-606670158" href="#Requirements">Requirements</a></li><li>2.2 <a id="a-606672268" href="#Installation">Installation</a><ul><li>2.2.1 <a id="a-606673868" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2 <a id="a-606675958" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3 <a id="a-606680748" href="#Usage">Usage</a><ul><li>3.1 <a id="a-606682288" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2 <a id="a-606684368" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3 <a id="a-606690578" href="#Customization">Customization</a></li></ul></li><li>4 <a id="a-606693198" href="#Resources">Resources</a></li></ul></div>
|
769
|
+
|
770
|
+
<div id="lof"><h1>Figures</h1> <ol><li><a id="a-606688358" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606677598" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
|
771
|
+
|
772
|
+
<div id="content">
|
773
|
+
<div class="chapter">
|
774
|
+
<h1 class="title">
|
775
|
+
Chapter
|
776
|
+
<a class="toc" id="Introduction" href="#a-606666518">1</a>
|
777
|
+
|
778
|
+
<br/>
|
779
|
+
|
780
|
+
<big>Introduction</big>
|
781
|
+
</h1>
|
782
|
+
|
783
|
+
<div class="content"><p>RMMSeg is an implementation of the
|
784
|
+
<a href="http://technology.chtsai.org/mmseg/">MMSEG</a> Chinese word
|
785
|
+
segmentation algorithm. It is based on two variants of maximum
|
786
|
+
matching algorithms. Two algorithms are available for use:</p>
|
787
|
+
|
788
|
+
|
789
|
+
<ul>
|
790
|
+
<li>simple algorithm that uses only forward maximum matching.</li>
|
791
|
+
<li>complex algorithm that uses three-word chunk maximum matching and 3
|
792
|
+
additional rules to resolve ambiguities.</li>
|
793
|
+
</ul>
|
794
|
+
|
795
|
+
|
796
|
+
<p>For more information about the algorithm, please refer to the
|
797
|
+
following essays:</p>
|
798
|
+
|
799
|
+
|
800
|
+
<ul>
|
801
|
+
<li>http://technology.chtsai.org/mmseg/</li>
|
802
|
+
<li>http://pluskid.lifegoo.com/?p=261</li>
|
803
|
+
</ul>
|
804
|
+
|
805
|
+
|
806
|
+
<p>RMMSeg can be used either as a standalone program or as an Analyzer for
|
807
|
+
<a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
|
808
|
+
</div>
|
809
|
+
<div class="chapter">
|
810
|
+
<h1 class="title">
|
811
|
+
Chapter
|
812
|
+
<a class="toc" id="Setup" href="#a-606668658">2</a>
|
813
|
+
|
814
|
+
<br/>
|
815
|
+
|
816
|
+
<big>Setup</big>
|
817
|
+
</h1>
|
818
|
+
|
819
|
+
<div class="content"><div class="section">
|
820
|
+
<h2 class="title">
|
821
|
+
<a class="toc" id="Requirements" href="#a-606670158">2.1</a> Requirements
|
822
|
+
</h2>
|
823
|
+
<div class="content">Your system needs the following software to run RMMSeg.
|
824
|
+
|
825
|
+
|
826
|
+
<table border="1">
|
827
|
+
<tr>
|
828
|
+
<th>Software </th>
|
829
|
+
<th>Notes </th>
|
830
|
+
</tr>
|
831
|
+
<tr>
|
832
|
+
<td> <a href="http://ruby-lang.org">Ruby</a> </td>
|
833
|
+
<td> Version 1.8.x is required </td>
|
834
|
+
</tr>
|
835
|
+
<tr>
|
836
|
+
<td> <a href="http://seattlerb.rubyforge.org/hoe/">hoe</a> </td>
|
837
|
+
<td> If you want to build the gem manually </td>
|
838
|
+
</tr>
|
839
|
+
<tr>
|
840
|
+
<td> <a href="http://rake.rubyforge.org/">Rake</a> </td>
|
841
|
+
<td> If you want to build the gem manually </td>
|
842
|
+
</tr>
|
843
|
+
<tr>
|
844
|
+
<td> <a href="http://rspec.rubyforge.org/">rspec</a> </td>
|
845
|
+
<td> If you want to run the test cases </td>
|
846
|
+
</tr>
|
847
|
+
</table></div>
|
848
|
+
</div>
|
849
|
+
<div class="section">
|
850
|
+
<h2 class="title">
|
851
|
+
<a class="toc" id="Installation" href="#a-606672268">2.2</a> Installation
|
852
|
+
</h2>
|
853
|
+
<div class="content"><div class="section">
|
854
|
+
<h3 class="title">
|
855
|
+
<a class="toc" id="Using-RubyGems" href="#a-606673868">2.2.1</a> Using RubyGems
|
856
|
+
</h3>
|
857
|
+
<div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
|
858
|
+
|
859
|
+
|
860
|
+
<pre>sudo gem install rmmseg</pre>
|
861
|
+
|
862
|
+
|
863
|
+
<p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
|
864
|
+
|
865
|
+
|
866
|
+
<pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
|
867
|
+
</div>
|
868
|
+
<div class="section">
|
869
|
+
<h3 class="title">
|
870
|
+
<a class="toc" id="From-Subversion" href="#a-606675958">2.2.2</a> From Subversion
|
871
|
+
</h3>
|
872
|
+
<div class="content"><p>From the Subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
|
873
|
+
<div class="note">
|
874
|
+
<p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606677598">Note 1</a>. The latest code might be unstable</p>
|
875
|
+
|
876
|
+
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAABHNCSVQICAgI
|
877
|
+
fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
|
878
|
+
Lmlua3NjYXBlLm9yZ5vuPBoAAAmCSURBVGiB1ZltbJXlGcd/9/N+Tlssh1Ja
|
879
|
+
hNEJaK2UFxFptqC2wGCSTWdM3DSQzWVDEzVxy4xfjNkHQ8xMnEEXviwZ62Rj
|
880
|
+
6siWTcVBZeJA5yBQpOWlKNZSaEvpyyk95zwv970Pp+fY5/S0tFBkXsmTc67n
|
881
|
+
eu77+f/u6349Ryil+Cqbdq0FXKldEuClp+3nfvVz/eJLT9vPXQv/igFc13tq
|
882
|
+
w4/ujLqu99S18C9lRr6br7zyyi2WZb2eSqUqpVSUlE5BSmW8/PLL6sv2bdtu
|
883
|
+
TiaT9z/++ONN486AZVm/XLiw+qZlt98Wun/bsqUhf+nSJSF/ya2LQv6ixQtD
|
884
|
+
/sJFC0L+guqqkF91S2XIr6y6idlfm1Vp2/Yz+XSOCiCEqDJMQ3ze+hm6bnC+
|
885
|
+
sx9d1zl+vBlN1znf2Y+m6TQfa0LTvvCbmo6G/KNHj4T8Ix83hvzGI4dD/uHG
|
886
|
+
QyG/peUElmUJIURlPp2jAkgpiwXg+R5FM9dRv3U/heXrkFJSWHZ32i/7NlJK
|
887
|
+
CsrWUr91P9EZa5FSEildQ/3W/USmfwspJc701dRv3Y9dshopJda0VdRv3Y8V
|
888
|
+
W4mUEjO2kvqt+zCm1iGlxCiu5fe/24defBeu6yHSkopHAxD51oEtW7bEb1++
|
889
|
+
rLC5+ShBEKCU+uJCgWJCvlJD9/iinlw/o2O4b5omixct4fixkz2PPPJILB9A
|
890
|
+
3kEspYzOmjWbxYuWjIiNtfBdjdjg4EWam44XjhYf0YU2b948xTTNoGRaSbjl
|
891
|
+
R2mlqx0rLCxC0zR98+bNdj6AERkIgiDmOI6nlDK/7NbOF1NKYdt2KplMxoCz
|
892
|
+
ufERAKZpxiKRSDC8wnfeeWfUl06W0FxbtWpV9tloNOoPDg6OD0BKGYtEIqGX
|
893
|
+
rV69etKFXio2vGs5jqOklOMbxJqmxaLRqBheSRAEo77sSoWOZkKI7DORSERo
|
894
|
+
mjY+AKVULBKJGMMB3n333Uum/XK35aOVq6ury2YiEonoSqnxAziOYw0HqKur
|
895
|
+
G5fIyR68mU/Hccyes8cq19cVbQB84Fh9Q/xgXgDTNGfYtq0NBxivkMmGAGg7
|
896
|
+
3cTOV58xe9tPPvnwuodUxFX89cOd/vq6orX1DfG9IwB0XS+zbTs0kHbt2jVh
|
897
|
+
iMwscrkQrS2H+ce2TfSdPc2Dd9xN1cL7TK29E9rbWWBXWk9o+95eX1c0Jd9K
|
898
|
+
XGqaZghg5cqVExIyEaG51tL0IW9u2wRuijVrf4JXXIHR1oZoaIC2NjhzBtHb
|
899
|
+
S+lcTT9jBNPzAUyzLCvUBy9HyETLHm98nze3bWKw7xy1NZXcOG8JUmq0BgGt
|
900
|
+
paVoRUWUfPwxnb5Pl+/RJ6QAIvkGcXFuBq4mxNEDu3lr2yaUN8BdyyuZO/d2
|
901
|
+
hNBQ0sPrf43C0yvoqqjic9umdXCQ85akeT6plMUL9f+Mf5pvIZtiGAa5g3gy
|
902
|
+
IZRSNH74Nm/9cRO2HrByeSUVFRWAQAYpEj2nSF44hZI+F/tn4V0/n3hPJ+1V
|
903
|
+
ym+KKFcY4lXPU89A/r1QgWEY+L6fvbd79+4rhkg/Izlz8j+c/O/rlBRHufuO
|
904
|
+
BcyeNRsA6Q+S6G4h1XcaJdMLZ0evT3tXG/65VznU9j5eofqN5/N8/c7+9kyd
|
905
|
+
ofPAiy++GIlGo/ENGzborutOWOhosSDw+ehfb7Dzzy9QVjKFO5bdTHl5eTrm
|
906
|
+
xklcOEmq73NQEqmgvduj5axLR5/GRVdQd89G2uMFsqd3wH722Wf94XWHMhCJ
|
907
|
+
RGK2bbtAZDLmc99z+aDhT+x6/dfMmTWdH3znG5ROL03HUr0kzp/AjbcDikAq
|
908
|
+
Wrt8TrWn6IgbSC1K3b2PsbzuAQqLrmP79u2u0KypQNeoAL7vxxzH8XMHb0ND
|
909
|
+
w4Qh2k99RPO//0DlvNlsuL+WabFpAHiJbpLdx3EHOgBwfcVnnR4t7SnO9Qk0
|
910
|
+
q5Ablt7Lgz/+BZqmZ+uzbdsfGBiIjQkwtI1QuTNQbW3tJVt7uB14bwcfHdnB
|
911
|
+
xofvoTgaBQTuxXMkuk/gD3YDkHAVn55z+eRcis64QUnZXB7Y8DNuXnzXiNlv
|
912
|
+
aD+kDMMYsR8akQHTNIWU8rJnoAN7d7DrtefZ+MSjRGfMI/HpPuInd+EnewGI
|
913
|
+
JySnzrp81unRGRdU3LiUHz76JBU3LhkhOmNSShzHId+WOgSQTCZjlmWFdqIA
|
914
|
+
e/bsGdcs03ZiH62H/sJPH12PXXAd0r2IXVZFz5E36BkIaDnr0trl09kXUF6x
|
915
|
+
iOW191BYXM7p9j5Ot+8J1XXnnXeGYEzT1F3XHRsgCIKYZVlmLsCKFStGtMqw
|
916
|
+
Mvi+z4H3dtD1yXus+e59WJaBn+hBc+MoBUfaFM2nk1y4KKlaupq1teuZWjIT
|
917
|
+
wzDQdR3DMDAMA9M0s/UOFy+lxLZtw/f9sQE8z5tu27aR24UyhwshROiQ47ou
|
918
|
+
rutyaN/f2Pv3Lew9fJ7Dxzr4/pqv0xt36e4ZoOt8D6mUy7zFa6m5dR0FRVPx
|
919
|
+
lUFvby+maWJZFqZpZr9blkVmIR0OY5qm6XnetDEBlFLllmWNaO29e/eS8xy+
|
920
|
+
7+P7Pm0nPuD4B9soKCxkzTdvoOC6Ms755TglMWZ+bSpzIsWYzhR0XSdQOslk
|
921
|
+
ksw7hBAIIdA0LfuZ2cbkvs80TaGUKh8TwPf9UsuykFKGIGpqanLLZVPbffNc
|
922
|
+
vvfQY3heGsjzPDzPy3at3Exqmoau6+i6jmmaGIaB4zg4joNhGKEsZ0xKyZCu
|
923
|
+
GZcCKMnsRIf3QV3XRwBkYmUzK0JAmSsIguz3fK2t6zqapqFp2og6873Hsiw8
|
924
|
+
z5s+JkAqlZqaycDlTKO5gsZTbjynvkwGXNedOiaA67pT8gFMBOJqxDIAqVRq
|
925
|
+
xI+8IYBEIlE4lKoJn4evZiwDkEgkinKfyeZ748aNpuu6pmma2QzkXplKv+yY
|
926
|
+
lBLTNEkmk86CBQus4QDZDDQ2Nl4/f/5817Isp7S0NDv3T6aNZzUfzXzfRwjh
|
927
|
+
K6XmCCHOAoNKKWkACCEKqqurpwkhZEtLC42NjZOledKsuroaTdOU67olwACg
|
928
|
+
CSEGDCGEBdiu6zq+75NMJkNL+v+LJRIJfN8nkUhEgAgQAK5BehzoPT098e7u
|
929
|
+
bj2VSjFnzpxrKjafSSm5cOEC/f39KUAnrVsTgAkUAYU1NTUPx2KxjUEQTLmW
|
930
|
+
YvOZpmnxjo6O3x48eHA76S4UB3qEUgohRBQoGLoyKTJJp+nyR97kmA1IwANS
|
931
|
+
wODQ1auUSmYP9UNjoQBwAIv0DBUMFb6WZpNuRB9wSYsfUEr5kOdfSiGETrr1
|
932
|
+
DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
933
|
+
" alt="note" class="icon"/>
|
934
|
+
|
935
|
+
<div class="content">Some new features may only be available in the latest code in subversion, but the code might be broken in some cases. So it is recommended to use the released gem package for production.</div>
|
936
|
+
</div> To check out the code from Rubyforge, you need to install subversion, then:</p>
|
937
|
+
|
938
|
+
|
939
|
+
<pre>svn checkout http://rmmseg.rubyforge.org/svn/trunk/ rmmseg</pre>
|
940
|
+
|
941
|
+
|
942
|
+
<p>Then you can run</p>
|
943
|
+
|
944
|
+
|
945
|
+
<pre>rake gem</pre>
|
946
|
+
|
947
|
+
|
948
|
+
<p>to build the gem file.</p></div>
|
949
|
+
</div></div>
|
950
|
+
</div></div>
|
951
|
+
</div>
|
952
|
+
<div class="chapter">
|
953
|
+
<h1 class="title">
|
954
|
+
Chapter
|
955
|
+
<a class="toc" id="Usage" href="#a-606680748">3</a>
|
956
|
+
|
957
|
+
<br/>
|
958
|
+
|
959
|
+
<big>Usage</big>
|
960
|
+
</h1>
|
961
|
+
|
962
|
+
<div class="content"><div class="section">
|
963
|
+
<h2 class="title">
|
964
|
+
<a class="toc" id="Stand-Alone-rmmseg" href="#a-606682288">3.1</a> Stand Alone rmmseg
|
965
|
+
</h2>
|
966
|
+
<div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
|
967
|
+
|
968
|
+
|
969
|
+
<pre>rmmseg -h</pre>
|
970
|
+
|
971
|
+
|
972
|
+
<p>It reads from STDIN and prints the result to STDOUT. Here is a real
|
973
|
+
example:</p>
|
974
|
+
|
975
|
+
|
976
|
+
<pre>$ echo "我们都喜欢用 Ruby" | rmmseg
|
977
|
+
我们 都 喜欢 用 Ruby</pre></div>
|
978
|
+
</div>
|
979
|
+
<div class="section">
|
980
|
+
<h2 class="title">
|
981
|
+
<a class="toc" id="Analyzer-for-Ferret" href="#a-606684368">3.2</a> Analyzer for Ferret
|
982
|
+
</h2>
|
983
|
+
<div class="content"><p>RMMSeg includes an analyzer for Ferret. It is ready to
|
984
|
+
use. Just require it and pass it to Ferret. Here’s a complete
|
985
|
+
example:</p>
|
986
|
+
|
987
|
+
|
988
|
+
<pre class="code" lang="ruby">
|
989
|
+
<span style="color:#888">#!/usr/bin/env ruby</span>
|
990
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
|
991
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
|
992
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
|
993
|
+
|
994
|
+
analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new { |tokenizer|
|
995
|
+
<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analysis</span>::<span style="color:#036; font-weight:bold">LowerCaseFilter</span>.new(tokenizer)
|
996
|
+
}
|
997
|
+
|
998
|
+
<span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> => analyzer)
|
999
|
+
|
1000
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1001
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">分词</span><span style="color:#710">"</span></span>,
|
1002
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">"</span></span>
|
1003
|
+
}
|
1004
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1005
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg</span><span style="color:#710">"</span></span>,
|
1006
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。</span><span style="color:#710">"</span></span>
|
1007
|
+
}
|
1008
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1009
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">"</span></span>,
|
1010
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">"</span></span>
|
1011
|
+
}
|
1012
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1013
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ferret</span><span style="color:#710">"</span></span>,
|
1014
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710"><<END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
|
1015
|
+
Ferret is a high-performance, full-featured text search engine library
|
1016
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1017
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1018
|
+
most flexible search libraries available. And it is surprisingly easy
|
1019
|
+
to use.</span><span style="color:#710">
|
1020
|
+
END</span></span>
|
1021
|
+
}
|
1022
|
+
|
1023
|
+
<span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
|
1024
|
+
<span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">"</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
|
1025
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">*** Document </span><span style="color:#04D">\"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\"</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1026
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">-</span><span style="color:#710">"</span></span>*<span style="color:#00D; font-weight:bold">40</span>
|
1027
|
+
highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>, id,
|
1028
|
+
<span style="color:#A60">:field</span> => <span style="color:#A60">:content</span>,
|
1029
|
+
<span style="color:#A60">:pre_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">"</span></span>,
|
1030
|
+
<span style="color:#A60">:post_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>)
|
1031
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1032
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1033
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1034
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1035
|
+
|
1036
|
+
<span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
|
1037
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>
|
1038
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1039
|
+
highlight_search(key)
|
1040
|
+
}
|
1041
|
+
|
1042
|
+
<span style="color:#888"># Local Variables:</span>
|
1043
|
+
<span style="color:#888"># coding: utf-8</span>
|
1044
|
+
<span style="color:#888"># End:</span>
|
1045
|
+
</pre>
|
1046
|
+
|
1047
|
+
|
1048
|
+
<p>execute it on the following key words:</p>
|
1049
|
+
|
1050
|
+
|
1051
|
+
<pre>$ ruby ferret_example.rb Ruby 中文</pre>
|
1052
|
+
|
1053
|
+
|
1054
|
+
<p>will generate the following results:</p>
|
1055
|
+
|
1056
|
+
|
1057
|
+
<pre class="code" lang="text">
|
1058
|
+
Searching for Ruby...
|
1059
|
+
|
1060
|
+
*** Document "RMMSeg" found with a score of 0.21875
|
1061
|
+
----------------------------------------
|
1062
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1063
|
+
|
1064
|
+
*** Document "Ruby 1.9" found with a score of 0.21875
|
1065
|
+
----------------------------------------
|
1066
|
+
Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
|
1067
|
+
|
1068
|
+
*** Document "Ferret" found with a score of 0.176776692271233
|
1069
|
+
----------------------------------------
|
1070
|
+
Ferret is a high-performance, full-featured text search engine library
|
1071
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1072
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1073
|
+
most flexible search libraries available. And it is surprisingly easy
|
1074
|
+
to use.
|
1075
|
+
|
1076
|
+
Searching for 中文...
|
1077
|
+
|
1078
|
+
*** Document "分词" found with a score of 0.281680464744568
|
1079
|
+
----------------------------------------
|
1080
|
+
中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
|
1081
|
+
|
1082
|
+
*** Document "RMMSeg" found with a score of 0.281680464744568
|
1083
|
+
----------------------------------------
|
1084
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1085
|
+
</pre>
|
1086
|
+
|
1087
|
+
|
1088
|
+
<p>And if you run the example in a terminal, you’ll see the result
|
1089
|
+
highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
|
1090
|
+
|
1091
|
+
|
1092
|
+
<p><div class="figure">
|
1093
|
+
<p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606688358">Figure 1</a>. Ferret Example Screenshot</p>
|
1094
|
+
<div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1095
|
+
</div></p></div>
|
1096
|
+
</div>
|
1097
|
+
<div class="section">
|
1098
|
+
<h2 class="title">
|
1099
|
+
<a class="toc" id="Customization" href="#a-606690578">3.3</a> Customization
|
1100
|
+
</h2>
|
1101
|
+
<div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
|
1102
|
+
|
1103
|
+
|
1104
|
+
<pre class="code" lang="ruby">
|
1105
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict1.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">true</span>], <span style="color:#888"># with frequency info</span>
|
1106
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict2.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
|
1107
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict3.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
|
1108
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
|
1109
|
+
</pre>
|
1110
|
+
|
1111
|
+
|
1112
|
+
<p>Or, to use the simple algorithm for faster (but less accurate) segmentation:</p>
|
1113
|
+
|
1114
|
+
|
1115
|
+
<pre class="code">
|
1116
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
|
1117
|
+
</pre>
|
1118
|
+
|
1119
|
+
|
1120
|
+
<p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
|
1121
|
+
</div></div>
|
1122
|
+
</div>
|
1123
|
+
<div class="chapter">
|
1124
|
+
<h1 class="title">
|
1125
|
+
Chapter
|
1126
|
+
<a class="toc" id="Resources" href="#a-606693198">4</a>
|
1127
|
+
|
1128
|
+
<br/>
|
1129
|
+
|
1130
|
+
<big>Resources</big>
|
1131
|
+
</h1>
|
1132
|
+
|
1133
|
+
<div class="content"><ul>
|
1134
|
+
<li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
|
1135
|
+
<li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto-generated RDoc of RMMSeg.</li>
|
1136
|
+
<li><a href="http://pluskid.lifegoo.com/?p=272">A Screencast</a>: Demo of Ferret RMMSeg and acts_as_ferret.</li>
|
1137
|
+
<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
|
1138
|
+
<li><a href="mailto:pluskid@gmail.com">Author’s Email</a>: Contact me if you have any problems.</li>
|
1139
|
+
</ul></div>
|
1140
|
+
</div></div>
|
1141
|
+
|
1142
|
+
|
1143
|
+
<br style="display: none"/>
|
1144
|
+
<hr style="display: none"/>
|
1145
|
+
<br style="display: none"/>
|
1146
|
+
|
1147
|
+
|
1148
|
+
<div id="footer">
|
1149
|
+
|
1150
|
+
Generated on Wed Feb 06 11:37:39 -0800 2008 by <a href="http://gerbil.rubyforge.org">Gerbil</a> 2.0.0.
|
1151
|
+
|
1152
|
+
<p>The admonition icons (<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAABHNCSVQICAgI
|
1153
|
+
fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
|
1154
|
+
Lmlua3NjYXBlLm9yZ5vuPBoAAAmCSURBVGiB1ZltbJXlGcd/9/N+Tlssh1Ja
|
1155
|
+
hNEJaK2UFxFptqC2wGCSTWdM3DSQzWVDEzVxy4xfjNkHQ8xMnEEXviwZ62Rj
|
1156
|
+
6siWTcVBZeJA5yBQpOWlKNZSaEvpyyk95zwv970Pp+fY5/S0tFBkXsmTc67n
|
1157
|
+
eu77+f/u6349Ryil+Cqbdq0FXKldEuClp+3nfvVz/eJLT9vPXQv/igFc13tq
|
1158
|
+
w4/ujLqu99S18C9lRr6br7zyyi2WZb2eSqUqpVSUlE5BSmW8/PLL6sv2bdtu
|
1159
|
+
TiaT9z/++ONN486AZVm/XLiw+qZlt98Wun/bsqUhf+nSJSF/ya2LQv6ixQtD
|
1160
|
+
/sJFC0L+guqqkF91S2XIr6y6idlfm1Vp2/Yz+XSOCiCEqDJMQ3ze+hm6bnC+
|
1161
|
+
sx9d1zl+vBlN1znf2Y+m6TQfa0LTvvCbmo6G/KNHj4T8Ix83hvzGI4dD/uHG
|
1162
|
+
QyG/peUElmUJIURlPp2jAkgpiwXg+R5FM9dRv3U/heXrkFJSWHZ32i/7NlJK
|
1163
|
+
CsrWUr91P9EZa5FSEildQ/3W/USmfwspJc701dRv3Y9dshopJda0VdRv3Y8V
|
1164
|
+
W4mUEjO2kvqt+zCm1iGlxCiu5fe/24defBeu6yHSkopHAxD51oEtW7bEb1++
|
1165
|
+
rLC5+ShBEKCU+uJCgWJCvlJD9/iinlw/o2O4b5omixct4fixkz2PPPJILB9A
|
1166
|
+
3kEspYzOmjWbxYuWjIiNtfBdjdjg4EWam44XjhYf0YU2b948xTTNoGRaSbjl
|
1167
|
+
R2mlqx0rLCxC0zR98+bNdj6AERkIgiDmOI6nlDK/7NbOF1NKYdt2KplMxoCz
|
1168
|
+
ufERAKZpxiKRSDC8wnfeeWfUl06W0FxbtWpV9tloNOoPDg6OD0BKGYtEIqGX
|
1169
|
+
rV69etKFXio2vGs5jqOklOMbxJqmxaLRqBheSRAEo77sSoWOZkKI7DORSERo
|
1170
|
+
mjY+AKVULBKJGMMB3n333Uum/XK35aOVq6ury2YiEonoSqnxAziOYw0HqKur
|
1171
|
+
G5fIyR68mU/Hccyes8cq19cVbQB84Fh9Q/xgXgDTNGfYtq0NBxivkMmGAGg7
|
1172
|
+
3cTOV58xe9tPPvnwuodUxFX89cOd/vq6orX1DfG9IwB0XS+zbTs0kHbt2jVh
|
1173
|
+
iMwscrkQrS2H+ce2TfSdPc2Dd9xN1cL7TK29E9rbWWBXWk9o+95eX1c0Jd9K
|
1174
|
+
XGqaZghg5cqVExIyEaG51tL0IW9u2wRuijVrf4JXXIHR1oZoaIC2NjhzBtHb
|
1175
|
+
S+lcTT9jBNPzAUyzLCvUBy9HyETLHm98nze3bWKw7xy1NZXcOG8JUmq0BgGt
|
1176
|
+
paVoRUWUfPwxnb5Pl+/RJ6QAIvkGcXFuBq4mxNEDu3lr2yaUN8BdyyuZO/d2
|
1177
|
+
hNBQ0sPrf43C0yvoqqjic9umdXCQ85akeT6plMUL9f+Mf5pvIZtiGAa5g3gy
|
1178
|
+
IZRSNH74Nm/9cRO2HrByeSUVFRWAQAYpEj2nSF44hZI+F/tn4V0/n3hPJ+1V
|
1179
|
+
ym+KKFcY4lXPU89A/r1QgWEY+L6fvbd79+4rhkg/Izlz8j+c/O/rlBRHufuO
|
1180
|
+
BcyeNRsA6Q+S6G4h1XcaJdMLZ0evT3tXG/65VznU9j5eofqN5/N8/c7+9kyd
|
1181
|
+
ofPAiy++GIlGo/ENGzborutOWOhosSDw+ehfb7Dzzy9QVjKFO5bdTHl5eTrm
|
1182
|
+
xklcOEmq73NQEqmgvduj5axLR5/GRVdQd89G2uMFsqd3wH722Wf94XWHMhCJ
|
1183
|
+
RGK2bbtAZDLmc99z+aDhT+x6/dfMmTWdH3znG5ROL03HUr0kzp/AjbcDikAq
|
1184
|
+
Wrt8TrWn6IgbSC1K3b2PsbzuAQqLrmP79u2u0KypQNeoAL7vxxzH8XMHb0ND
|
1185
|
+
w4Qh2k99RPO//0DlvNlsuL+WabFpAHiJbpLdx3EHOgBwfcVnnR4t7SnO9Qk0
|
1186
|
+
q5Ablt7Lgz/+BZqmZ+uzbdsfGBiIjQkwtI1QuTNQbW3tJVt7uB14bwcfHdnB
|
1187
|
+
xofvoTgaBQTuxXMkuk/gD3YDkHAVn55z+eRcis64QUnZXB7Y8DNuXnzXiNlv
|
1188
|
+
aD+kDMMYsR8akQHTNIWU8rJnoAN7d7DrtefZ+MSjRGfMI/HpPuInd+EnewGI
|
1189
|
+
JySnzrp81unRGRdU3LiUHz76JBU3LhkhOmNSShzHId+WOgSQTCZjlmWFdqIA
|
1190
|
+
e/bsGdcs03ZiH62H/sJPH12PXXAd0r2IXVZFz5E36BkIaDnr0trl09kXUF6x
|
1191
|
+
iOW191BYXM7p9j5Ot+8J1XXnnXeGYEzT1F3XHRsgCIKYZVlmLsCKFStGtMqw
|
1192
|
+
Mvi+z4H3dtD1yXus+e59WJaBn+hBc+MoBUfaFM2nk1y4KKlaupq1teuZWjIT
|
1193
|
+
wzDQdR3DMDAMA9M0s/UOFy+lxLZtw/f9sQE8z5tu27aR24UyhwshROiQ47ou
|
1194
|
+
rutyaN/f2Pv3Lew9fJ7Dxzr4/pqv0xt36e4ZoOt8D6mUy7zFa6m5dR0FRVPx
|
1195
|
+
lUFvby+maWJZFqZpZr9blkVmIR0OY5qm6XnetDEBlFLllmWNaO29e/eS8xy+
|
1196
|
+
7+P7Pm0nPuD4B9soKCxkzTdvoOC6Ms755TglMWZ+bSpzIsWYzhR0XSdQOslk
|
1197
|
+
ksw7hBAIIdA0LfuZ2cbkvs80TaGUKh8TwPf9UsuykFKGIGpqanLLZVPbffNc
|
1198
|
+
vvfQY3heGsjzPDzPy3at3Exqmoau6+i6jmmaGIaB4zg4joNhGKEsZ0xKyZCu
|
1199
|
+
GZcCKMnsRIf3QV3XRwBkYmUzK0JAmSsIguz3fK2t6zqapqFp2og6873Hsiw8
|
1200
|
+
z5s+JkAqlZqaycDlTKO5gsZTbjynvkwGXNedOiaA67pT8gFMBOJqxDIAqVRq
|
1201
|
+
xI+8IYBEIlE4lKoJn4evZiwDkEgkinKfyeZ748aNpuu6pmma2QzkXplKv+yY
|
1202
|
+
lBLTNEkmk86CBQus4QDZDDQ2Nl4/f/5817Isp7S0NDv3T6aNZzUfzXzfRwjh
|
1203
|
+
K6XmCCHOAoNKKWkACCEKqqurpwkhZEtLC42NjZOledKsuroaTdOU67olwACg
|
1204
|
+
CSEGDCGEBdiu6zq+75NMJkNL+v+LJRIJfN8nkUhEgAgQAK5BehzoPT098e7u
|
1205
|
+
bj2VSjFnzpxrKjafSSm5cOEC/f39KUAnrVsTgAkUAYU1NTUPx2KxjUEQTLmW
|
1206
|
+
YvOZpmnxjo6O3x48eHA76S4UB3qEUgohRBQoGLoyKTJJp+nyR97kmA1IwANS
|
1207
|
+
wODQ1auUSmYP9UNjoQBwAIv0DBUMFb6WZpNuRB9wSYsfUEr5kOdfSiGETrr1
|
1208
|
+
DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
1209
|
+
" alt="note"/>) used in this document are Copyright © 2005 <a href="http://tango.freedesktop.org">Tango Desktop Project</a>. They are part of the <a href="http://tango.freedesktop.org/Tango_Icon_Library">Tango Icon Theme</a> set, which is distributed under the <a href="http://creativecommons.org/licenses/by-sa/2.5/">Creative Commons Attribution-ShareAlike 2.5 License Agreement</a>.</p>
|
1210
|
+
|
1211
|
+
</div>
|
1212
|
+
|
1213
|
+
</body>
|
1214
|
+
</html>
|