rmmseg 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -1
- data/Manifest.txt +4 -2
- data/README.txt +13 -1
- data/Rakefile +1 -1
- data/TODO.txt +0 -1
- data/bin/rmmseg +0 -2
- data/{lib/rmmseg → data}/chars.dic +0 -0
- data/data/punctuation.dic +79 -0
- data/{lib/rmmseg → data}/words.dic +0 -0
- data/lib/rmmseg.rb +1 -1
- data/lib/rmmseg/config.rb +3 -2
- data/lib/rmmseg/ferret.rb +72 -1
- data/misc/ferret_example.rb +56 -0
- data/misc/homepage.erb +86 -6
- data/misc/homepage.html +166 -20
- metadata +5 -3
data/History.txt
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
=== 0.1.0 / 2008-02-01
|
2
|
+
|
3
|
+
* Add a filter to remove Chinese punctuation.
|
4
|
+
|
5
|
+
|
1
6
|
=== 0.0.1 / 2008-01-31
|
2
7
|
|
3
|
-
*
|
8
|
+
* Analyzer integration with Ferret.
|
4
9
|
* rdoc added
|
5
10
|
* Lazily init the +Word+ objects inside the +Dictionary+.
|
6
11
|
* Handle English punctuation correctly.
|
data/Manifest.txt
CHANGED
@@ -4,10 +4,12 @@ README.txt
|
|
4
4
|
Rakefile
|
5
5
|
TODO.txt
|
6
6
|
bin/rmmseg
|
7
|
+
data/chars.dic
|
8
|
+
data/punctuation.dic
|
9
|
+
data/words.dic
|
7
10
|
lib/rmmseg.rb
|
8
11
|
lib/rmmseg/algorithm.rb
|
9
12
|
lib/rmmseg/amibguity.rb
|
10
|
-
lib/rmmseg/chars.dic
|
11
13
|
lib/rmmseg/chunk.rb
|
12
14
|
lib/rmmseg/complex_algorithm.rb
|
13
15
|
lib/rmmseg/config.rb
|
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
|
|
21
23
|
lib/rmmseg/svwl_rule.rb
|
22
24
|
lib/rmmseg/token.rb
|
23
25
|
lib/rmmseg/word.rb
|
24
|
-
|
26
|
+
misc/ferret_example.rb
|
25
27
|
misc/homepage.erb
|
26
28
|
misc/homepage.html
|
27
29
|
spec/chunk_spec.rb
|
data/README.txt
CHANGED
@@ -23,11 +23,23 @@ following essays:
|
|
23
23
|
|
24
24
|
* Provides +rmmseg+ command line tool for quick and easy way to access
|
25
25
|
the word segment feature.
|
26
|
-
* Provides an +
|
26
|
+
* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
|
27
27
|
|
28
28
|
== SYNOPSIS:
|
29
29
|
|
30
|
+
Using the command line tool +rmmseg+ is simple:
|
30
31
|
$ rmmseg --separator _ < input.txt
|
32
|
+
Passing the +-h+ option prints an overview of all supported options.
|
33
|
+
|
34
|
+
Using the +Analyzer+ for Ferret is even easier:
|
35
|
+
|
36
|
+
require 'rmmseg'
|
37
|
+
require 'rmmseg/ferret'
|
38
|
+
|
39
|
+
analyzer = RMMSeg::Ferret::Analyzer.new
|
40
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
41
|
+
|
42
|
+
For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
|
31
43
|
|
32
44
|
== REQUIREMENTS:
|
33
45
|
|
data/Rakefile
CHANGED
data/TODO.txt
CHANGED
data/bin/rmmseg
CHANGED
File without changes
|
@@ -0,0 +1,79 @@
|
|
1
|
+
{
|
2
|
+
×
|
3
|
+
π
|
4
|
+
)
|
5
|
+
〖
|
6
|
+
;
|
7
|
+
〗
|
8
|
+
<
|
9
|
+
°
|
10
|
+
“
|
11
|
+
+
|
12
|
+
◆
|
13
|
+
♀
|
14
|
+
=
|
15
|
+
±
|
16
|
+
←
|
17
|
+
}
|
18
|
+
,
|
19
|
+
”
|
20
|
+
㎡
|
21
|
+
◇
|
22
|
+
>
|
23
|
+
↑
|
24
|
+
~
|
25
|
+
△
|
26
|
+
?
|
27
|
+
♂
|
28
|
+
‰
|
29
|
+
——
|
30
|
+
→
|
31
|
+
■
|
32
|
+
¥
|
33
|
+
-
|
34
|
+
@
|
35
|
+
≈
|
36
|
+
↓
|
37
|
+
□
|
38
|
+
〈
|
39
|
+
′
|
40
|
+
〉
|
41
|
+
/
|
42
|
+
★
|
43
|
+
《
|
44
|
+
○
|
45
|
+
″
|
46
|
+
☆
|
47
|
+
》
|
48
|
+
·
|
49
|
+
∶
|
50
|
+
!
|
51
|
+
『
|
52
|
+
§
|
53
|
+
●
|
54
|
+
』
|
55
|
+
…
|
56
|
+
【
|
57
|
+
℃
|
58
|
+
#
|
59
|
+
÷
|
60
|
+
】
|
61
|
+
№
|
62
|
+
$
|
63
|
+
※
|
64
|
+
≤
|
65
|
+
‖
|
66
|
+
%
|
67
|
+
≥
|
68
|
+
|
69
|
+
&
|
70
|
+
、
|
71
|
+
‘
|
72
|
+
〔
|
73
|
+
。
|
74
|
+
’
|
75
|
+
√
|
76
|
+
(
|
77
|
+
〕
|
78
|
+
£
|
79
|
+
:
|
File without changes
|
data/lib/rmmseg.rb
CHANGED
data/lib/rmmseg/config.rb
CHANGED
@@ -6,8 +6,9 @@ module RMMSeg
|
|
6
6
|
class Config
|
7
7
|
@algorithm = :complex
|
8
8
|
@on_ambiguity = :select_first
|
9
|
-
|
10
|
-
|
9
|
+
data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
|
10
|
+
@dictionaries = [[File.join(data_dir, "chars.dic"), true],
|
11
|
+
[File.join(data_dir, "words.dic"), false]]
|
11
12
|
@max_word_length = 4
|
12
13
|
|
13
14
|
class << self
|
data/lib/rmmseg/ferret.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# This file integrates RMMSeg with Ferret
|
2
|
+
require 'singleton'
|
2
3
|
require 'rubygems'
|
3
4
|
require 'ferret'
|
4
5
|
|
@@ -6,8 +7,25 @@ module RMMSeg
|
|
6
7
|
module Ferret
|
7
8
|
# The Analyzer class can be used with Ferret .
|
8
9
|
class Analyzer < ::Ferret::Analysis::Analyzer
|
10
|
+
|
11
|
+
# Construct an Analyzer. Optional block can be used to
|
12
|
+
# add more +TokenFilter+s. e.g.
|
13
|
+
#
|
14
|
+
# analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
15
|
+
# Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
16
|
+
# }
|
17
|
+
#
|
18
|
+
def initialize(&brk)
|
19
|
+
@brk = brk
|
20
|
+
end
|
21
|
+
|
9
22
|
def token_stream(field, text)
|
10
|
-
Tokenizer.new(text)
|
23
|
+
t = PunctuationFilter.new(Tokenizer.new(text))
|
24
|
+
if @brk
|
25
|
+
@brk.call(t)
|
26
|
+
else
|
27
|
+
t
|
28
|
+
end
|
11
29
|
end
|
12
30
|
end
|
13
31
|
|
@@ -39,5 +57,58 @@ module RMMSeg
|
|
39
57
|
@algor = RMMSeg::Config.algorithm_instance(@text)
|
40
58
|
end
|
41
59
|
end
|
60
|
+
|
61
|
+
# PunctuationFilter filters out the stand-alone Chinese
|
62
|
+
# punctuation tokens.
|
63
|
+
class PunctuationFilter < ::Ferret::Analysis::TokenStream
|
64
|
+
# The punctuation dictionary.
|
65
|
+
class Dictionary
|
66
|
+
include Singleton
|
67
|
+
|
68
|
+
DIC_FILE = File.join(File.dirname(__FILE__),
|
69
|
+
"..",
|
70
|
+
"..",
|
71
|
+
"data",
|
72
|
+
"punctuation.dic")
|
73
|
+
def initialize
|
74
|
+
@dic = Hash.new
|
75
|
+
File.open(DIC_FILE, "r") do |f|
|
76
|
+
f.each_line { |line|
|
77
|
+
@dic[line.chomp.freeze] = nil
|
78
|
+
}
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def include?(str)
|
83
|
+
@dic.has_key?(str)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def initialize(stream)
|
88
|
+
@stream = stream
|
89
|
+
end
|
90
|
+
|
91
|
+
# Get next token, skip stand alone Chinese punctuations.
|
92
|
+
def next
|
93
|
+
token = nil
|
94
|
+
dic = Dictionary.instance
|
95
|
+
loop do
|
96
|
+
token = @stream.next
|
97
|
+
break if token.nil?
|
98
|
+
|
99
|
+
break unless dic.include? token.text
|
100
|
+
end
|
101
|
+
|
102
|
+
token
|
103
|
+
end
|
104
|
+
|
105
|
+
def text
|
106
|
+
@stream.text
|
107
|
+
end
|
108
|
+
|
109
|
+
def text=(str)
|
110
|
+
@stream.text = str
|
111
|
+
end
|
112
|
+
end
|
42
113
|
end
|
43
114
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rmmseg'
|
4
|
+
require 'rmmseg/ferret'
|
5
|
+
|
6
|
+
analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
7
|
+
Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
8
|
+
}
|
9
|
+
$index = Ferret::Index::Index.new(:analyzer => analyzer,
|
10
|
+
:path => '/tmp/index')
|
11
|
+
|
12
|
+
$index << {
|
13
|
+
:title => "分词",
|
14
|
+
:content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
|
15
|
+
}
|
16
|
+
$index << {
|
17
|
+
:title => "RMMSeg",
|
18
|
+
:content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
|
19
|
+
}
|
20
|
+
$index << {
|
21
|
+
:title => "Ruby 1.9",
|
22
|
+
:content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
|
23
|
+
}
|
24
|
+
$index << {
|
25
|
+
:title => "Ferret",
|
26
|
+
:content => <<END
|
27
|
+
Ferret is a high-performance, full-featured text search engine library
|
28
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
29
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
30
|
+
most flexible search libraries available. And it is surprisingly easy
|
31
|
+
to use.
|
32
|
+
END
|
33
|
+
}
|
34
|
+
|
35
|
+
def highlight_search(key)
|
36
|
+
$index.search_each(%Q!content:"#{key}"!) do |id, score|
|
37
|
+
puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
|
38
|
+
puts "-"*40
|
39
|
+
highlights = $index.highlight("content:#{key}", id,
|
40
|
+
:field => :content,
|
41
|
+
:pre_tag => "\033[36m",
|
42
|
+
:post_tag => "\033[m")
|
43
|
+
puts "#{highlights}"
|
44
|
+
puts ""
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
ARGV.each { |key|
|
49
|
+
puts "\033[33mSearching for #{key}...\033[m"
|
50
|
+
puts ""
|
51
|
+
highlight_search(key)
|
52
|
+
}
|
53
|
+
|
54
|
+
# Local Variables:
|
55
|
+
# coding: utf-8
|
56
|
+
# End:
|
data/misc/homepage.erb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
<%# -*- mode: text; coding: utf-8 -*- %>
|
1
2
|
<%
|
2
3
|
$title = "RMMSeg Homepage"
|
3
4
|
$authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
|
@@ -22,7 +23,7 @@
|
|
22
23
|
* http://technology.chtsai.org/mmseg/
|
23
24
|
* http://pluskid.lifegoo.com/?p=261
|
24
25
|
|
25
|
-
RMMSeg can be used as either a stand alone program or an
|
26
|
+
RMMSeg can be used as either a stand alone program or an Analyzer of
|
26
27
|
"Ferret":http://ferret.davebalmain.com/trac.
|
27
28
|
|
28
29
|
<% end %>
|
@@ -46,7 +47,7 @@
|
|
46
47
|
|
47
48
|
sudo gem install rmmseg
|
48
49
|
|
49
|
-
Or you can download the gem file manually from RubyForge and install it locally:
|
50
|
+
Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
|
50
51
|
|
51
52
|
sudo gem install --local rmmseg-x.y.z.gem
|
52
53
|
|
@@ -77,15 +78,94 @@
|
|
77
78
|
|
78
79
|
rmmseg -h
|
79
80
|
|
80
|
-
It reads from STDIN and print result to STDOUT.
|
81
|
+
It reads from STDIN and prints the result to STDOUT. Here is a real
|
82
|
+
example:
|
83
|
+
|
84
|
+
$ echo "我们都喜欢用 Ruby" | rmmseg
|
85
|
+
我们 都 喜欢 用 Ruby
|
86
|
+
|
87
|
+
<% end %>
|
88
|
+
|
89
|
+
<% section "Analyzer for Ferret" do %>
|
90
|
+
RMMSeg includes an analyzer for Ferret. It is simply ready to
|
91
|
+
use. Just require it and pass it to Ferret. Here's a complete
|
92
|
+
example:
|
93
|
+
|
94
|
+
<code lang="ruby">
|
95
|
+
<%# include ferret_example.rb %>
|
96
|
+
</code>
|
97
|
+
|
98
|
+
execute it on the following key words:
|
99
|
+
|
100
|
+
$ ruby ferret_example.rb Ruby 中文
|
101
|
+
|
102
|
+
will generate the following results:
|
103
|
+
|
104
|
+
<code lang="text">
|
105
|
+
Searching for Ruby...
|
106
|
+
|
107
|
+
*** Document "RMMSeg" found with a score of 0.21875
|
108
|
+
----------------------------------------
|
109
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
110
|
+
|
111
|
+
*** Document "Ruby 1.9" found with a score of 0.21875
|
112
|
+
----------------------------------------
|
113
|
+
Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
|
114
|
+
|
115
|
+
*** Document "Ferret" found with a score of 0.176776692271233
|
116
|
+
----------------------------------------
|
117
|
+
Ferret is a high-performance, full-featured text search engine library
|
118
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
119
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
120
|
+
most flexible search libraries available. And it is surprisingly easy
|
121
|
+
to use.
|
122
|
+
|
123
|
+
Searching for 中文...
|
124
|
+
|
125
|
+
*** Document "分词" found with a score of 0.281680464744568
|
126
|
+
----------------------------------------
|
127
|
+
中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
|
128
|
+
|
129
|
+
*** Document "RMMSeg" found with a score of 0.281680464744568
|
130
|
+
----------------------------------------
|
131
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
132
|
+
</code>
|
133
|
+
|
134
|
+
And if you run the example in terminal, you'll see the result
|
135
|
+
highlighted as in <%= xref "Ferret Example Screenshot" %>.
|
136
|
+
|
137
|
+
<% figure "Ferret Example Screenshot" do %>
|
138
|
+
!http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
|
139
|
+
<% end %>
|
140
|
+
|
141
|
+
<% end %>
|
142
|
+
|
143
|
+
<% section "Customization" do %>
|
144
|
+
RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
|
145
|
+
|
146
|
+
<code lang="ruby">
|
147
|
+
RMMSeg::Config.dictionaries = [["dict1.dic", true], # with frequency info
|
148
|
+
["dict2.dic", false], # without
|
149
|
+
["dict3.dic", false]]
|
150
|
+
RMMSeg::Config.max_word_length = 6
|
151
|
+
</code>
|
152
|
+
|
153
|
+
Or to use the simple algorithm for more efficient (and less accurate) segmenting:
|
154
|
+
|
155
|
+
<code>
|
156
|
+
RMMSeg::Config.algorithm = :simple
|
157
|
+
</code>
|
158
|
+
|
159
|
+
For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
|
81
160
|
<% end %>
|
82
161
|
|
83
162
|
<% end %>
|
84
163
|
|
85
164
|
<% chapter "Resources" do %>
|
86
|
-
* "Project Home":http://
|
87
|
-
* "
|
88
|
-
* "
|
165
|
+
* "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
|
166
|
+
* "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
|
167
|
+
* "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
|
168
|
+
* "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
|
89
169
|
<% end %>
|
90
170
|
|
91
171
|
<% footer do %>
|
data/misc/homepage.html
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
<html>
|
3
3
|
<head>
|
4
4
|
<meta http-equiv="content-type" content="text/html; charset=utf-8"/>
|
5
|
-
<meta name="date" content="
|
5
|
+
<meta name="date" content="01 February 2008"/>
|
6
6
|
<meta name="author" content="pluskid"/>
|
7
7
|
<meta name="generator" content="Gerbil 1.1.0"/>
|
8
8
|
<title>RMMSeg Homepage</title>
|
@@ -763,19 +763,19 @@
|
|
763
763
|
|
764
764
|
<h1 class="title">RMMSeg Homepage</h1>
|
765
765
|
<h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
|
766
|
-
<h3 class="date">
|
766
|
+
<h3 class="date">01 February 2008</h3>
|
767
767
|
</div>
|
768
768
|
|
769
769
|
|
770
|
-
<div id="toc"><h1>Contents</h1> <ul><li>1 <a id="a-
|
770
|
+
<div id="toc"><h1>Contents</h1> <ul><li>1 <a id="a-606801458" href="#Introduction">Introduction</a></li><li>2 <a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1 <a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2 <a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1 <a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2 <a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3 <a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1 <a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2 <a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3 <a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4 <a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
|
771
771
|
|
772
|
-
<div id="lof"><h1>Notes</h1> <ol><li><a id="a-
|
772
|
+
<div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
|
773
773
|
|
774
774
|
<div id="content">
|
775
775
|
<div class="chapter">
|
776
776
|
<h1 class="title">
|
777
777
|
Chapter
|
778
|
-
<a class="toc" id="Introduction" href="#a-
|
778
|
+
<a class="toc" id="Introduction" href="#a-606801458">1</a>
|
779
779
|
|
780
780
|
<br/>
|
781
781
|
|
@@ -805,13 +805,13 @@ following essays:</p>
|
|
805
805
|
</ul>
|
806
806
|
|
807
807
|
|
808
|
-
<p>RMMSeg can be used as either a stand alone program or an
|
808
|
+
<p>RMMSeg can be used as either a stand alone program or an Analyzer of
|
809
809
|
<a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
|
810
810
|
</div>
|
811
811
|
<div class="chapter">
|
812
812
|
<h1 class="title">
|
813
813
|
Chapter
|
814
|
-
<a class="toc" id="Setup" href="#a-
|
814
|
+
<a class="toc" id="Setup" href="#a-606803598">2</a>
|
815
815
|
|
816
816
|
<br/>
|
817
817
|
|
@@ -820,7 +820,7 @@ following essays:</p>
|
|
820
820
|
|
821
821
|
<div class="content"><div class="section">
|
822
822
|
<h2 class="title">
|
823
|
-
<a class="toc" id="Requirements" href="#a-
|
823
|
+
<a class="toc" id="Requirements" href="#a-606805098">2.1</a> Requirements
|
824
824
|
</h2>
|
825
825
|
<div class="content">Your system needs the following software to run RMMSeg.
|
826
826
|
|
@@ -850,11 +850,11 @@ following essays:</p>
|
|
850
850
|
</div>
|
851
851
|
<div class="section">
|
852
852
|
<h2 class="title">
|
853
|
-
<a class="toc" id="Installation" href="#a-
|
853
|
+
<a class="toc" id="Installation" href="#a-606807208">2.2</a> Installation
|
854
854
|
</h2>
|
855
855
|
<div class="content"><div class="section">
|
856
856
|
<h3 class="title">
|
857
|
-
<a class="toc" id="Using-RubyGems" href="#a-
|
857
|
+
<a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a> Using RubyGems
|
858
858
|
</h3>
|
859
859
|
<div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
|
860
860
|
|
@@ -862,18 +862,18 @@ following essays:</p>
|
|
862
862
|
<pre>sudo gem install rmmseg</pre>
|
863
863
|
|
864
864
|
|
865
|
-
<p>Or you can download the gem file manually from RubyForge and install it locally:</p>
|
865
|
+
<p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
|
866
866
|
|
867
867
|
|
868
868
|
<pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
|
869
869
|
</div>
|
870
870
|
<div class="section">
|
871
871
|
<h3 class="title">
|
872
|
-
<a class="toc" id="From-Subversion" href="#a-
|
872
|
+
<a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a> From Subversion
|
873
873
|
</h3>
|
874
874
|
<div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
|
875
875
|
<div class="note">
|
876
|
-
<p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-
|
876
|
+
<p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>. The latest code might be unstable</p>
|
877
877
|
|
878
878
|
<img src="
|
879
879
|
fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
|
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
954
954
|
<div class="chapter">
|
955
955
|
<h1 class="title">
|
956
956
|
Chapter
|
957
|
-
<a class="toc" id="Usage" href="#a-
|
957
|
+
<a class="toc" id="Usage" href="#a-606815688">3</a>
|
958
958
|
|
959
959
|
<br/>
|
960
960
|
|
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
963
963
|
|
964
964
|
<div class="content"><div class="section">
|
965
965
|
<h2 class="title">
|
966
|
-
<a class="toc" id="Stand-Alone-rmmseg" href="#a-
|
966
|
+
<a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a> Stand Alone rmmseg
|
967
967
|
</h2>
|
968
968
|
<div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
|
969
969
|
|
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
971
971
|
<pre>rmmseg -h</pre>
|
972
972
|
|
973
973
|
|
974
|
-
<p>It reads from STDIN and print result to STDOUT
|
974
|
+
<p>It reads from STDIN and prints the result to STDOUT. Here is a real
|
975
|
+
example:</p>
|
976
|
+
|
977
|
+
|
978
|
+
<pre>$ echo "我们都喜欢用 Ruby" | rmmseg
|
979
|
+
我们 都 喜欢 用 Ruby</pre></div>
|
980
|
+
</div>
|
981
|
+
<div class="section">
|
982
|
+
<h2 class="title">
|
983
|
+
<a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a> Analyzer for Ferret
|
984
|
+
</h2>
|
985
|
+
<div class="content"><p>RMMSeg includes an analyzer for Ferret. It is simply ready to
|
986
|
+
use. Just require it and pass it to Ferret. Here’s a complete
|
987
|
+
example:</p>
|
988
|
+
|
989
|
+
|
990
|
+
<pre class="code" lang="ruby">
|
991
|
+
<span style="color:#888">#!/usr/bin/env ruby</span>
|
992
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
|
993
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
|
994
|
+
require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
|
995
|
+
|
996
|
+
analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
|
997
|
+
<span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> => analyzer)
|
998
|
+
|
999
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1000
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">分词</span><span style="color:#710">"</span></span>,
|
1001
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">"</span></span>
|
1002
|
+
}
|
1003
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1004
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg</span><span style="color:#710">"</span></span>,
|
1005
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。</span><span style="color:#710">"</span></span>
|
1006
|
+
}
|
1007
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1008
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">"</span></span>,
|
1009
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">"</span></span>
|
1010
|
+
}
|
1011
|
+
<span style="color:#d70; font-weight:bold">$index</span> << {
|
1012
|
+
<span style="color:#A60">:title</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">Ferret</span><span style="color:#710">"</span></span>,
|
1013
|
+
<span style="color:#A60">:content</span> => <span style="background-color:#fff0f0"><span style="color:#710"><<END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
|
1014
|
+
Ferret is a high-performance, full-featured text search engine library
|
1015
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1016
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1017
|
+
most flexible search libraries available. And it is surprisingly easy
|
1018
|
+
to use.</span><span style="color:#710">
|
1019
|
+
END</span></span>
|
1020
|
+
}
|
1021
|
+
|
1022
|
+
<span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
|
1023
|
+
<span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">"</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
|
1024
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">*** Document </span><span style="color:#04D">\"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\"</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1025
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">-</span><span style="color:#710">"</span></span>*<span style="color:#00D; font-weight:bold">40</span>
|
1026
|
+
highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>, id,
|
1027
|
+
<span style="color:#A60">:field</span> => <span style="color:#A60">:content</span>,
|
1028
|
+
<span style="color:#A60">:pre_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">"</span></span>,
|
1029
|
+
<span style="color:#A60">:post_tag</span> => <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>)
|
1030
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">"</span></span>
|
1031
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1032
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1033
|
+
<span style="color:#080; font-weight:bold">end</span>
|
1034
|
+
|
1035
|
+
<span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
|
1036
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">"</span></span>
|
1037
|
+
puts <span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#710">"</span></span>
|
1038
|
+
highlight_search(key)
|
1039
|
+
}
|
1040
|
+
|
1041
|
+
<span style="color:#888"># Local Variables:</span>
|
1042
|
+
<span style="color:#888"># coding: utf-8</span>
|
1043
|
+
<span style="color:#888"># End:</span>
|
1044
|
+
</pre>
|
1045
|
+
|
1046
|
+
|
1047
|
+
<p>execute it on the following key words:</p>
|
1048
|
+
|
1049
|
+
|
1050
|
+
<pre>$ ruby ferret_example.rb Ruby 中文</pre>
|
1051
|
+
|
1052
|
+
|
1053
|
+
<p>will generate the following results:</p>
|
1054
|
+
|
1055
|
+
|
1056
|
+
<pre class="code" lang="text">
|
1057
|
+
Searching for Ruby...
|
1058
|
+
|
1059
|
+
*** Document "RMMSeg" found with a score of 0.21875
|
1060
|
+
----------------------------------------
|
1061
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1062
|
+
|
1063
|
+
*** Document "Ruby 1.9" found with a score of 0.21875
|
1064
|
+
----------------------------------------
|
1065
|
+
Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
|
1066
|
+
|
1067
|
+
*** Document "Ferret" found with a score of 0.176776692271233
|
1068
|
+
----------------------------------------
|
1069
|
+
Ferret is a high-performance, full-featured text search engine library
|
1070
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
1071
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
1072
|
+
most flexible search libraries available. And it is surprisingly easy
|
1073
|
+
to use.
|
1074
|
+
|
1075
|
+
Searching for 中文...
|
1076
|
+
|
1077
|
+
*** Document "分词" found with a score of 0.281680464744568
|
1078
|
+
----------------------------------------
|
1079
|
+
中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
|
1080
|
+
|
1081
|
+
*** Document "RMMSeg" found with a score of 0.281680464744568
|
1082
|
+
----------------------------------------
|
1083
|
+
RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
|
1084
|
+
</pre>
|
1085
|
+
|
1086
|
+
|
1087
|
+
<p>And if you run the example in terminal, you’ll see the result
|
1088
|
+
highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
|
1089
|
+
|
1090
|
+
|
1091
|
+
<p><div class="figure">
|
1092
|
+
<p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>. Ferret Example Screenshot</p>
|
1093
|
+
<div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
|
1094
|
+
</div></p></div>
|
1095
|
+
</div>
|
1096
|
+
<div class="section">
|
1097
|
+
<h2 class="title">
|
1098
|
+
<a class="toc" id="Customization" href="#a-606825488">3.3</a> Customization
|
1099
|
+
</h2>
|
1100
|
+
<div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
|
1101
|
+
|
1102
|
+
|
1103
|
+
<pre class="code" lang="ruby">
|
1104
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict1.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">true</span>], <span style="color:#888"># with frequency info</span>
|
1105
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict2.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
|
1106
|
+
[<span style="background-color:#fff0f0"><span style="color:#710">"</span><span style="color:#D20">dict3.dic</span><span style="color:#710">"</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
|
1107
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
|
1108
|
+
</pre>
|
1109
|
+
|
1110
|
+
|
1111
|
+
<p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
|
1112
|
+
|
1113
|
+
|
1114
|
+
<pre class="code">
|
1115
|
+
<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
|
1116
|
+
</pre>
|
1117
|
+
|
1118
|
+
|
1119
|
+
<p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
|
975
1120
|
</div></div>
|
976
1121
|
</div>
|
977
1122
|
<div class="chapter">
|
978
1123
|
<h1 class="title">
|
979
1124
|
Chapter
|
980
|
-
<a class="toc" id="Resources" href="#a-
|
1125
|
+
<a class="toc" id="Resources" href="#a-606828108">4</a>
|
981
1126
|
|
982
1127
|
<br/>
|
983
1128
|
|
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
|
|
985
1130
|
</h1>
|
986
1131
|
|
987
1132
|
<div class="content"><ul>
|
988
|
-
<li><a href="http://
|
989
|
-
<li><a href="http://
|
990
|
-
<li><a href="http://
|
1133
|
+
<li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
|
1134
|
+
<li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
|
1135
|
+
<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
|
1136
|
+
<li><a href="mailto:pluskid@gmail.com">Author’s Email</a>: Contact me if you have any problem.</li>
|
991
1137
|
</ul></div>
|
992
1138
|
</div>
|
993
1139
|
</div>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -39,10 +39,12 @@ files:
|
|
39
39
|
- Rakefile
|
40
40
|
- TODO.txt
|
41
41
|
- bin/rmmseg
|
42
|
+
- data/chars.dic
|
43
|
+
- data/punctuation.dic
|
44
|
+
- data/words.dic
|
42
45
|
- lib/rmmseg.rb
|
43
46
|
- lib/rmmseg/algorithm.rb
|
44
47
|
- lib/rmmseg/amibguity.rb
|
45
|
-
- lib/rmmseg/chars.dic
|
46
48
|
- lib/rmmseg/chunk.rb
|
47
49
|
- lib/rmmseg/complex_algorithm.rb
|
48
50
|
- lib/rmmseg/config.rb
|
@@ -56,7 +58,7 @@ files:
|
|
56
58
|
- lib/rmmseg/svwl_rule.rb
|
57
59
|
- lib/rmmseg/token.rb
|
58
60
|
- lib/rmmseg/word.rb
|
59
|
-
-
|
61
|
+
- misc/ferret_example.rb
|
60
62
|
- misc/homepage.erb
|
61
63
|
- misc/homepage.html
|
62
64
|
- spec/chunk_spec.rb
|