rmmseg 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -1
- data/Manifest.txt +4 -2
- data/README.txt +13 -1
- data/Rakefile +1 -1
- data/TODO.txt +0 -1
- data/bin/rmmseg +0 -2
- data/{lib/rmmseg → data}/chars.dic +0 -0
- data/data/punctuation.dic +79 -0
- data/{lib/rmmseg → data}/words.dic +0 -0
- data/lib/rmmseg.rb +1 -1
- data/lib/rmmseg/config.rb +3 -2
- data/lib/rmmseg/ferret.rb +72 -1
- data/misc/ferret_example.rb +56 -0
- data/misc/homepage.erb +86 -6
- data/misc/homepage.html +166 -20
- metadata +5 -3
data/History.txt
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
=== 0.1.0 / 2008-02-01
|
2
|
+
|
3
|
+
* Add a filter to remove Chinese punctuation.
|
4
|
+
|
5
|
+
|
1
6
|
=== 0.0.1 / 2008-01-31
|
2
7
|
|
3
|
-
*
|
8
|
+
* Analyzer integration with Ferret.
|
4
9
|
* rdoc added
|
5
10
|
* Lazily init the +Word+ objects inside the +Dictionary+.
|
6
11
|
* Handle English punctuation correctly.
|
data/Manifest.txt
CHANGED
@@ -4,10 +4,12 @@ README.txt
|
|
4
4
|
Rakefile
|
5
5
|
TODO.txt
|
6
6
|
bin/rmmseg
|
7
|
+
data/chars.dic
|
8
|
+
data/punctuation.dic
|
9
|
+
data/words.dic
|
7
10
|
lib/rmmseg.rb
|
8
11
|
lib/rmmseg/algorithm.rb
|
9
12
|
lib/rmmseg/amibguity.rb
|
10
|
-
lib/rmmseg/chars.dic
|
11
13
|
lib/rmmseg/chunk.rb
|
12
14
|
lib/rmmseg/complex_algorithm.rb
|
13
15
|
lib/rmmseg/config.rb
|
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
|
|
21
23
|
lib/rmmseg/svwl_rule.rb
|
22
24
|
lib/rmmseg/token.rb
|
23
25
|
lib/rmmseg/word.rb
|
24
|
-
|
26
|
+
misc/ferret_example.rb
|
25
27
|
misc/homepage.erb
|
26
28
|
misc/homepage.html
|
27
29
|
spec/chunk_spec.rb
|
data/README.txt
CHANGED
@@ -23,11 +23,23 @@ following essays:
|
|
23
23
|
|
24
24
|
* Provides +rmmseg+ command line tool for quick and easy way to access
|
25
25
|
the word segment feature.
|
26
|
-
* Provides an +
|
26
|
+
* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
|
27
27
|
|
28
28
|
== SYNOPSIS:
|
29
29
|
|
30
|
+
Using the command line tool +rmmseg+ is simple:
|
30
31
|
$ rmmseg --separator _ < input.txt
|
32
|
+
Passing the +-h+ option prints an overview of all supported options.
|
33
|
+
|
34
|
+
Using the +Analyzer+ for Ferret is even easier:
|
35
|
+
|
36
|
+
require 'rmmseg'
|
37
|
+
require 'rmmseg/ferret'
|
38
|
+
|
39
|
+
analyzer = RMMSeg::Ferret::Analyzer.new
|
40
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
41
|
+
|
42
|
+
For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
|
31
43
|
|
32
44
|
== REQUIREMENTS:
|
33
45
|
|
data/Rakefile
CHANGED
data/TODO.txt
CHANGED
data/bin/rmmseg
CHANGED
File without changes
|
@@ -0,0 +1,79 @@
|
|
1
|
+
{
|
2
|
+
×
|
3
|
+
π
|
4
|
+
)
|
5
|
+
〖
|
6
|
+
;
|
7
|
+
〗
|
8
|
+
<
|
9
|
+
°
|
10
|
+
“
|
11
|
+
+
|
12
|
+
◆
|
13
|
+
♀
|
14
|
+
=
|
15
|
+
±
|
16
|
+
←
|
17
|
+
}
|
18
|
+
,
|
19
|
+
”
|
20
|
+
㎡
|
21
|
+
◇
|
22
|
+
>
|
23
|
+
↑
|
24
|
+
~
|
25
|
+
△
|
26
|
+
?
|
27
|
+
♂
|
28
|
+
‰
|
29
|
+
——
|
30
|
+
→
|
31
|
+
■
|
32
|
+
¥
|
33
|
+
-
|
34
|
+
@
|
35
|
+
≈
|
36
|
+
↓
|
37
|
+
□
|
38
|
+
〈
|
39
|
+
′
|
40
|
+
〉
|
41
|
+
/
|
42
|
+
★
|
43
|
+
《
|
44
|
+
○
|
45
|
+
″
|
46
|
+
☆
|
47
|
+
》
|
48
|
+
·
|
49
|
+
∶
|
50
|
+
!
|
51
|
+
『
|
52
|
+
§
|
53
|
+
●
|
54
|
+
』
|
55
|
+
…
|
56
|
+
【
|
57
|
+
℃
|
58
|
+
#
|
59
|
+
÷
|
60
|
+
】
|
61
|
+
№
|
62
|
+
$
|
63
|
+
※
|
64
|
+
≤
|
65
|
+
‖
|
66
|
+
%
|
67
|
+
≥
|
68
|
+
|
69
|
+
&
|
70
|
+
、
|
71
|
+
‘
|
72
|
+
〔
|
73
|
+
。
|
74
|
+
’
|
75
|
+
√
|
76
|
+
(
|
77
|
+
〕
|
78
|
+
£
|
79
|
+
:
|
File without changes
|
data/lib/rmmseg.rb
CHANGED
data/lib/rmmseg/config.rb
CHANGED
@@ -6,8 +6,9 @@ module RMMSeg
|
|
6
6
|
class Config
|
7
7
|
@algorithm = :complex
|
8
8
|
@on_ambiguity = :select_first
|
9
|
-
|
10
|
-
|
9
|
+
data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
|
10
|
+
@dictionaries = [[File.join(data_dir, "chars.dic"), true],
|
11
|
+
[File.join(data_dir, "words.dic"), false]]
|
11
12
|
@max_word_length = 4
|
12
13
|
|
13
14
|
class << self
|
data/lib/rmmseg/ferret.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# This file integrates RMMSeg with Ferret
|
2
|
+
require 'singleton'
|
2
3
|
require 'rubygems'
|
3
4
|
require 'ferret'
|
4
5
|
|
@@ -6,8 +7,25 @@ module RMMSeg
|
|
6
7
|
module Ferret
|
7
8
|
# The Analyzer class can be used with Ferret.
|
8
9
|
class Analyzer < ::Ferret::Analysis::Analyzer
|
10
|
+
|
11
|
+
# Construct an Analyzer. Optional block can be used to
|
12
|
+
# add more +TokenFilter+s. e.g.
|
13
|
+
#
|
14
|
+
# analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
15
|
+
# Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
16
|
+
# }
|
17
|
+
#
|
18
|
+
def initialize(&brk)
|
19
|
+
@brk = brk
|
20
|
+
end
|
21
|
+
|
9
22
|
def token_stream(field, text)
|
10
|
-
Tokenizer.new(text)
|
23
|
+
t = PunctuationFilter.new(Tokenizer.new(text))
|
24
|
+
if @brk
|
25
|
+
@brk.call(t)
|
26
|
+
else
|
27
|
+
t
|
28
|
+
end
|
11
29
|
end
|
12
30
|
end
|
13
31
|
|
@@ -39,5 +57,58 @@ module RMMSeg
|
|
39
57
|
@algor = RMMSeg::Config.algorithm_instance(@text)
|
40
58
|
end
|
41
59
|
end
|
60
|
+
|
61
|
+
# PunctuationFilter filters out the stand-alone Chinese
|
62
|
+
# punctuation tokens.
|
63
|
+
class PunctuationFilter < ::Ferret::Analysis::TokenStream
|
64
|
+
# The punctuation dictionary.
|
65
|
+
class Dictionary
|
66
|
+
include Singleton
|
67
|
+
|
68
|
+
DIC_FILE = File.join(File.dirname(__FILE__),
|
69
|
+
"..",
|
70
|
+
"..",
|
71
|
+
"data",
|
72
|
+
"punctuation.dic")
|
73
|
+
def initialize
|
74
|
+
@dic = Hash.new
|
75
|
+
File.open(DIC_FILE, "r") do |f|
|
76
|
+
f.each_line { |line|
|
77
|
+
@dic[line.chomp.freeze] = nil
|
78
|
+
}
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def include?(str)
|
83
|
+
@dic.has_key?(str)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def initialize(stream)
|
88
|
+
@stream = stream
|
89
|
+
end
|
90
|
+
|
91
|
+
# Get the next token, skipping stand-alone Chinese punctuation.
|
92
|
+
def next
|
93
|
+
token = nil
|
94
|
+
dic = Dictionary.instance
|
95
|
+
loop do
|
96
|
+
token = @stream.next
|
97
|
+
break if token.nil?
|
98
|
+
|
99
|
+
break unless dic.include? token.text
|
100
|
+
end
|
101
|
+
|
102
|
+
token
|
103
|
+
end
|
104
|
+
|
105
|
+
def text
|
106
|
+
@stream.text
|
107
|
+
end
|
108
|
+
|
109
|
+
def text=(str)
|
110
|
+
@stream.text = str
|
111
|
+
end
|
112
|
+
end
|
42
113
|
end
|
43
114
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rmmseg'
|
4
|
+
require 'rmmseg/ferret'
|
5
|
+
|
6
|
+
analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
7
|
+
Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
8
|
+
}
|
9
|
+
$index = Ferret::Index::Index.new(:analyzer => analyzer,
|
10
|
+
:path => '/tmp/index')
|
11
|
+
|
12
|
+
$index << {
|
13
|
+
:title => "分词",
|
14
|
+
:content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
|
15
|
+
}
|
16
|
+
$index << {
|
17
|
+
:title => "RMMSeg",
|
18
|
+
:content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
|
19
|
+
}
|
20
|
+
$index << {
|
21
|
+
:title => "Ruby 1.9",
|
22
|
+
:content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
|
23
|
+
}
|
24
|
+
$index << {
|
25
|
+
:title => "Ferret",
|
26
|
+
:content => <<END
|
27
|
+
Ferret is a high-performance, full-featured text search engine library
|
28
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
29
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
30
|
+
most flexible search libraries available. And it is surprisingly easy
|
31
|
+
to use.
|
32
|
+
END
|
33
|
+
}
|
34
|
+
|
35
|
+
def highlight_search(key)
|
36
|
+
$index.search_each(%Q!content:"#{key}"!) do |id, score|
|
37
|
+
puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
|
38
|
+
puts "-"*40
|
39
|
+
highlights = $index.highlight("content:#{key}", id,
|
40
|
+
:field => :content,
|
41
|
+
:pre_tag => "\033[36m",
|
42
|
+
:post_tag => "\033[m")
|
43
|
+
puts "#{highlights}"
|
44
|
+
puts ""
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
ARGV.each { |key|
|
49
|
+
puts "\033[33mSearching for #{key}...\033[m"
|
50
|
+
puts ""
|
51
|
+
highlight_search(key)
|
52
|
+
}
|
53
|
+
|
54
|
+
# Local Variables:
|
55
|
+
# coding: utf-8
|
56
|
+
# End:
|
data/misc/homepage.erb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
<%# -*- mode: text; coding: utf-8 -*- %>
|
1
2
|
<%
|
2
3
|
$title = "RMMSeg Homepage"
|
3
4
|
$authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
|
@@ -22,7 +23,7 @@
|
|
22
23
|
* http://technology.chtsai.org/mmseg/
|
23
24
|
* http://pluskid.lifegoo.com/?p=261
|
24
25
|
|
25
|
-
RMMSeg can be used as either a stand alone program or an
|
26
|
+
RMMSeg can be used as either a stand alone program or an Analyzer of
|
26
27
|
"Ferret":http://ferret.davebalmain.com/trac.
|
27
28
|
|
28
29
|
<% end %>
|
@@ -46,7 +47,7 @@
|
|
46
47
|
|
47
48
|
sudo gem install rmmseg
|
48
49
|
|
49
|
-
Or you can download the gem file manually from RubyForge and install it locally:
|
50
|
+
Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
|
50
51
|
|
51
52
|
sudo gem install --local rmmseg-x.y.z.gem
|
52
53
|
|
@@ -77,15 +78,94 @@
|
|
77
78
|
|
78
79
|
rmmseg -h
|
79
80
|
|
80
|
-
It reads from STDIN and print result to STDOUT.
|
81
|
+
It reads from STDIN and prints the result to STDOUT. Here is a real
|
82
|
+
example:
|
83
|
+
|
84
|
+
$ echo "我们都喜欢用 Ruby" | rmmseg
|
85
|
+
我们 都 喜欢 用 Ruby
|
86
|
+
|
87
|
+
<% end %>
|
88
|
+
|
89
|
+
<% section "Analyzer for Ferret" do %>
|
90
|
+
RMMSeg includes an analyzer for Ferret. It is simply ready to
|
91
|
+
use. Just require it and pass it to Ferret. Here's a complete
|
92
|
+
example:
|
93
|
+
|
94
|
+
<code lang="ruby">
|
95
|
+
<%# include ferret_example.rb %>
|
96
|
+
</code>
|
97
|
+
|
98
|
+
execute it on the following key words:
|
99
|
+
|
100
|
+
$ ruby ferret_example.rb Ruby 中文
|
101
|
+
|
102
|
+
will generate the following results:
|
103
|
+
|
104
|
+
<code lang="text">
|
105
|
+
Searching for Ruby...
|
106
|
+
|
107
|
+
*** Document "RMMSeg" found with a score of 0.21875
|
108
|
+
----------------------------------------
|
109
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
110
|
+
|
111
|
+
*** Document "Ruby 1.9" found with a score of 0.21875
|
112
|
+
----------------------------------------
|
113
|
+
Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
|
114
|
+
|
115
|
+
*** Document "Ferret" found with a score of 0.176776692271233
|
116
|
+
----------------------------------------
|
117
|
+
Ferret is a high-performance, full-featured text search engine library
|
118
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
119
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
120
|
+
most flexible search libraries available. And it's surprisingly easy
|
121
|
+
to use.
|
122
|
+
|
123
|
+
Searching for 中文...
|
124
|
+
|
125
|
+
*** Document "分词" found with a score of 0.281680464744568
|
126
|
+
----------------------------------------
|
127
|
+
中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
|
128
|
+
|
129
|
+
*** Document "RMMSeg" found with a score of 0.281680464744568
|
130
|
+
----------------------------------------
|
131
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
132
|
+
</code>
|
133
|
+
|
134
|
+
And if you run the example in terminal, you'll see the result
|
135
|
+
highlighted as in <%= xref "Ferret Example Screenshot" %>.
|
136
|
+
|
137
|
+
<% figure "Ferret Example Screenshot" do %>
|
138
|
+
!http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
|
139
|
+
<% end %>
|
140
|
+
|
141
|
+
<% end %>
|
142
|
+
|
143
|
+
<% section "Customization" do %>
|
144
|
+
RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
|
145
|
+
|
146
|
+
<code lang="ruby">
|
147
|
+
RMMSeg::Config.dictionaries = [["dict1.dic", true], # with frequency info
|
148
|
+
["dict2.dic", false], # without
|
149
|
+
["dict3.dic", false]]
|
150
|
+
RMMSeg::Config.max_word_length = 6
|
151
|
+
</code>
|
152
|
+
|
153
|
+
Or to use the simple algorithm for more efficient (and less accurate) segmenting:
|
154
|
+
|
155
|
+
<code>
|
156
|
+
RMMSeg::Config.algorithm = :simple
|
157
|
+
</code>
|
158
|
+
|
159
|
+
For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
|
81
160
|
<% end %>
|
82
161
|
|
83
162
|
<% end %>
|
84
163
|
|
85
164
|
<% chapter "Resources" do %>
|
86
|
-
* "Project Home":http://
|
87
|
-
* "
|
88
|
-
* "
|
165
|
+
* "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
|
166
|
+
* "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
|
167
|
+
* "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
|
168
|
+
* "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
|
89
169
|
<% end %>
|
90
170
|
|
91
171
|
<% footer do %>
|
data/misc/homepage.html
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
<html>
|
3
3
|
<head>
|
4
4
|
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
|
5
|
-
<meta name="date" content="
|
5
|
+
<meta name="date" content="01 February 2008"/>
|
6
6
|
<meta name="author" content="pluskid"/>
|
7
7
|
<meta name="generator" content="Gerbil 1.1.0"/>
|
8
8
|
<title>RMMSeg Homepage</title>
|
@@ -763,19 +763,19 @@
|
|
763
763
|
|
764
764
|
<h1 class="title">RMMSeg Homepage</h1>
|
765
765
|
<h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
|
766
|
-
<h3 class="date">
|
766
|
+
<h3 class="date">01 February 2008</h3>
|
767
767
|
</div>
|
768
768
|
|
769
769
|
|
770
|
-
<div id="toc"><h1>Contents</h1> <ul><li>1 <a id="a-
|
770
|
+
<div id="toc"><h1>Contents</h1> <ul><li>1 <a id="a-606801458" href="#Introduction">Introduction</a></li><li>2 <a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1 <a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2 <a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1 <a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2 <a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3 <a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1 <a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2 <a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3 <a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4 <a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
|
771
771
|
|
772
|
-
<div id="lof"><h1>Notes</h1> <ol><li><a id="a-
|
772
|
+
<div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
|
773
773
|
|
774
774
|
<div id="content">
|
775
775
|
<div class="chapter">
|
776
776
|
<h1 class="title">
|
777
777
|
Chapter
|
778
|
-
<a class="toc" id="Introduction" href="#a-
|
778
|
+
<a class="toc" id="Introduction" href="#a-606801458">1</a>
|
779
779
|
|
780
780
|
<br/>
|
781
781
|
|
@@ -805,13 +805,13 @@ following essays:</p>
|
|
805
805
|
</ul>
|
806
806
|
|
807
807
|
|
808
|
-
<p>RMMSeg can be used as either a stand alone program or an
|
808
|
+
<p>RMMSeg can be used as either a stand alone program or an Analyzer of
|
809
809
|
<a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
|
810
810
|
</div>
|
811
811
|
<div class="chapter">
|
812
812
|
<h1 class="title">
|
813
813
|
Chapter
|
814
|
-
<a class="toc" id="Setup" href="#a-
|
814
|
+
<a class="toc" id="Setup" href="#a-606803598">2</a>
|
815
815
|
|
816
816
|
<br/>
|
817
817
|
|
@@ -820,7 +820,7 @@ following essays:</p>
|
|
820
820
|
|
821
821
|
<div class="content"><div class="section">
|
822
822
|
<h2 class="title">
|
823
|
-
<a class="toc" id="Requirements" href="#a-
|
823
|
+
<a class="toc" id="Requirements" href="#a-606805098">2.1</a> Requirements
|
824
824
|
</h2>
|
825
825
|
<div class="content">Your system needs the following software to run RMMSeg.
|
826
826
|
|
@@ -850,11 +850,11 @@ following essays:</p>
|
|
850
850
|
</div>
|
851
851
|
<div class="section">
|
852
852
|
<h2 class="title">
|
853
|
-
<a class="toc" id="Installation" href="#a-
|
853
|
+
<a class="toc" id="Installation" href="#a-606807208">2.2</a> Installation
|
854
854
|
</h2>
|
855
855
|
<div class="content"><div class="section">
|
856
856
|
<h3 class="title">
|
857
|
-
<a class="toc" id="Using-RubyGems" href="#a-
|
857
|
+
<a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a> Using RubyGems
|
858
858
|
</h3>
|
859
859
|
<div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
|
860
860
|
|
@@ -862,18 +862,18 @@ following essays:</p>
|
|
862
862
|
<pre>sudo gem install rmmseg</pre>
|
863
863
|
|
864
864
|
|
865
|
-
<p>Or you can download the gem file manually from RubyForge and install it locally:</p>
|
865
|
+
<p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
|
866
866
|
|
867
867
|
|
868
868
|
<pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
|
869
869
|
</div>
|
870
870
|
<div class="section">
|
871
871
|
<h3 class="title">
|
872
|
-
<a class="toc" id="From-Subversion" href="#a-
|
872
|
+
<a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a> From Subversion
|
873
873
|
</h3>
|
874
874
|
<div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
|
875
875
|
<div class="note">
|
876
|
-
<p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-
|
876
|
+
<p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>. The latest code might be unstable</p>
|
877
877
|
|
878
878
|
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAABHNCSVQICAgI
|
879
879
|
fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
|
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
954
954
|
<div class="chapter">
|
955
955
|
<h1 class="title">
|
956
956
|
Chapter
|
957
|
-
<a class="toc" id="Usage" href="#a-
|
957
|
+
<a class="toc" id="Usage" href="#a-606815688">3</a>
|
958
958
|
|
959
959
|
<br/>
|
960
960
|
|
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
963
963
|
|
964
964
|
<div class="content"><div class="section">
|
965
965
|
<h2 class="title">
|
966
|
-
<a class="toc" id="Stand-Alone-rmmseg" href="#a-
|
966
|
+
<a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a> Stand Alone rmmseg
|
967
967
|
</h2>
|
968
968
|
<div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
|
969
969
|
|
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
971
971
|
<pre>rmmseg -h</pre>
|
972
972
|
|
973
973
|
|
974
|
-
<p>It reads from STDIN and print result to STDOUT
|
974
|
+
<p>It reads from STDIN and prints the result to STDOUT. Here is a real
|
975
|
+
example:</p>
|
976
|
+
|
977
|
+
|
978
|
+
<pre>$ echo "我们都喜欢用 Ruby" | rmmseg
|
979
|
+
我们 都 喜欢 用 Ruby</pre></div>
|
980
|
+
</div>
|
981
|
+
<div class="section">
|
982
|
+
<h2 class="title">
|
983
|
+
<a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a> Analyzer for Ferret
|
984
|
+
</h2>
|
985
|
+
<div class="content"><p>RMMSeg includes an analyzer for Ferret. It is simply ready to
|
986
|
+
use. Just require it and pass it to Ferret. Here’s a complete
|
987
|
+
example:</p>
|
988
|
+
|
989
|
+
|
990
|
+
<pre class="code" lang="ruby">
|
991
|
+
<span style="color:#888">#!/usr/bin/env ruby</span>
|
992
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
|
993
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
|
994
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
|
995
|
+
|
996
|
+
analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
|
997
|
+
<span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> => analyzer)
|
998
|
+
|
999
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1000
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">分词</span><span style="color:#710">"</span></span>,
|
1001
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">"</span></span>
|
1002
|
+
}
|
1003
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1004
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg</span><span style="color:#710">"</span></span>,
|
1005
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。</span><span style="color:#710">"</span></span>
|
1006
|
+
}
|
1007
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1008
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">"</span></span>,
|
1009
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">"</span></span>
|
1010
|
+
}
|
1011
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1012
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ferret</span><span style="color:#710">"</span></span>,
|
1013
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710"><<END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
|
1014
|
+
Ferret is a high-performance, full-featured text search engine library
|
1015
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1016
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1017
|
+
most flexible search libraries available. And it is surprisingly easy
|
1018
|
+
to use.</span><span style="color:#710">
|
1019
|
+
END</span></span>
|
1020
|
+
}
|
1021
|
+
|
1022
|
+
<span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
|
1023
|
+
<span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">"</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
|
1024
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">*** Document </span><span style="color:#04D">\"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\"</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1025
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">-</span><span style="color:#710">"</span></span>*<span style="color:#00D; font-weight:bold">40</span>
|
1026
|
+
highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>, id,
|
1027
|
+
<span style="color:#A60">:field</span> => <span style="color:#A60">:content</span>,
|
1028
|
+
<span style="color:#A60">:pre_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">"</span></span>,
|
1029
|
+
<span style="color:#A60">:post_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>)
|
1030
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1031
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1032
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1033
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1034
|
+
|
1035
|
+
<span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
|
1036
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>
|
1037
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1038
|
+
highlight_search(key)
|
1039
|
+
}
|
1040
|
+
|
1041
|
+
<span style="color:#888"># Local Variables:</span>
|
1042
|
+
<span style="color:#888"># coding: utf-8</span>
|
1043
|
+
<span style="color:#888"># End:</span>
|
1044
|
+
</pre>
|
1045
|
+
|
1046
|
+
|
1047
|
+
<p>execute it on the following key words:</p>
|
1048
|
+
|
1049
|
+
|
1050
|
+
<pre>$ ruby ferret_example.rb Ruby 中文</pre>
|
1051
|
+
|
1052
|
+
|
1053
|
+
<p>will generate the following results:</p>
|
1054
|
+
|
1055
|
+
|
1056
|
+
<pre class="code" lang="text">
|
1057
|
+
Searching for Ruby...
|
1058
|
+
|
1059
|
+
*** Document "RMMSeg" found with a score of 0.21875
|
1060
|
+
----------------------------------------
|
1061
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1062
|
+
|
1063
|
+
*** Document "Ruby 1.9" found with a score of 0.21875
|
1064
|
+
----------------------------------------
|
1065
|
+
Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
|
1066
|
+
|
1067
|
+
*** Document "Ferret" found with a score of 0.176776692271233
|
1068
|
+
----------------------------------------
|
1069
|
+
Ferret is a high-performance, full-featured text search engine library
|
1070
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1071
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1072
|
+
most flexible search libraries available. And it's surprisingly easy
|
1073
|
+
to use.
|
1074
|
+
|
1075
|
+
Searching for 中文...
|
1076
|
+
|
1077
|
+
*** Document "分词" found with a score of 0.281680464744568
|
1078
|
+
----------------------------------------
|
1079
|
+
中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
|
1080
|
+
|
1081
|
+
*** Document "RMMSeg" found with a score of 0.281680464744568
|
1082
|
+
----------------------------------------
|
1083
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1084
|
+
</pre>
|
1085
|
+
|
1086
|
+
|
1087
|
+
<p>And if you run the example in terminal, you’ll see the result
|
1088
|
+
highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
|
1089
|
+
|
1090
|
+
|
1091
|
+
<p><div class="figure">
|
1092
|
+
<p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>. Ferret Example Screenshot</p>
|
1093
|
+
<div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1094
|
+
</div></p></div>
|
1095
|
+
</div>
|
1096
|
+
<div class="section">
|
1097
|
+
<h2 class="title">
|
1098
|
+
<a class="toc" id="Customization" href="#a-606825488">3.3</a> Customization
|
1099
|
+
</h2>
|
1100
|
+
<div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
|
1101
|
+
|
1102
|
+
|
1103
|
+
<pre class="code" lang="ruby">
|
1104
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict1.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">true</span>], <span style="color:#888"># with frequency info</span>
|
1105
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict2.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
|
1106
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict3.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
|
1107
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
|
1108
|
+
</pre>
|
1109
|
+
|
1110
|
+
|
1111
|
+
<p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
|
1112
|
+
|
1113
|
+
|
1114
|
+
<pre class="code">
|
1115
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
|
1116
|
+
</pre>
|
1117
|
+
|
1118
|
+
|
1119
|
+
<p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
|
975
1120
|
</div></div>
|
976
1121
|
</div>
|
977
1122
|
<div class="chapter">
|
978
1123
|
<h1 class="title">
|
979
1124
|
Chapter
|
980
|
-
<a class="toc" id="Resources" href="#a-
|
1125
|
+
<a class="toc" id="Resources" href="#a-606828108">4</a>
|
981
1126
|
|
982
1127
|
<br/>
|
983
1128
|
|
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
985
1130
|
</h1>
|
986
1131
|
|
987
1132
|
<div class="content"><ul>
|
988
|
-
<li><a href="http://
|
989
|
-
<li><a href="http://
|
990
|
-
<li><a href="http://
|
1133
|
+
<li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
|
1134
|
+
<li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
|
1135
|
+
<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
|
1136
|
+
<li><a href="mailto:pluskid@gmail.com">Author’s Email</a>: Contact me if you have any problem.</li>
|
991
1137
|
</ul></div>
|
992
1138
|
</div>
|
993
1139
|
</div>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -39,10 +39,12 @@ files:
|
|
39
39
|
- Rakefile
|
40
40
|
- TODO.txt
|
41
41
|
- bin/rmmseg
|
42
|
+
- data/chars.dic
|
43
|
+
- data/punctuation.dic
|
44
|
+
- data/words.dic
|
42
45
|
- lib/rmmseg.rb
|
43
46
|
- lib/rmmseg/algorithm.rb
|
44
47
|
- lib/rmmseg/amibguity.rb
|
45
|
-
- lib/rmmseg/chars.dic
|
46
48
|
- lib/rmmseg/chunk.rb
|
47
49
|
- lib/rmmseg/complex_algorithm.rb
|
48
50
|
- lib/rmmseg/config.rb
|
@@ -56,7 +58,7 @@ files:
|
|
56
58
|
- lib/rmmseg/svwl_rule.rb
|
57
59
|
- lib/rmmseg/token.rb
|
58
60
|
- lib/rmmseg/word.rb
|
59
|
-
-
|
61
|
+
- misc/ferret_example.rb
|
60
62
|
- misc/homepage.erb
|
61
63
|
- misc/homepage.html
|
62
64
|
- spec/chunk_spec.rb
|