rmmseg 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,6 +1,11 @@
1
+ === 0.1.0 / 2008-02-01
2
+
3
+ * Add filter to filter out Chinese punctuations.
4
+
5
+
1
6
  === 0.0.1 / 2008-01-31
2
7
 
3
- * Analyser integration with Ferret.
8
+ * Analyzer integration with Ferret.
4
9
  * rdoc added
5
10
  * Lazily init the +Word+ objects inside the +Dictionary+.
6
11
  * Handle English punctuation correctly.
data/Manifest.txt CHANGED
@@ -4,10 +4,12 @@ README.txt
4
4
  Rakefile
5
5
  TODO.txt
6
6
  bin/rmmseg
7
+ data/chars.dic
8
+ data/punctuation.dic
9
+ data/words.dic
7
10
  lib/rmmseg.rb
8
11
  lib/rmmseg/algorithm.rb
9
12
  lib/rmmseg/amibguity.rb
10
- lib/rmmseg/chars.dic
11
13
  lib/rmmseg/chunk.rb
12
14
  lib/rmmseg/complex_algorithm.rb
13
15
  lib/rmmseg/config.rb
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
21
23
  lib/rmmseg/svwl_rule.rb
22
24
  lib/rmmseg/token.rb
23
25
  lib/rmmseg/word.rb
24
- lib/rmmseg/words.dic
26
+ misc/ferret_example.rb
25
27
  misc/homepage.erb
26
28
  misc/homepage.html
27
29
  spec/chunk_spec.rb
data/README.txt CHANGED
@@ -23,11 +23,23 @@ following essays:
23
23
 
24
24
  * Provides +rmmseg+ command line tool for quick and easy way to access
25
25
  the word segment feature.
26
- * Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
26
+ * Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
27
27
 
28
28
  == SYNOPSIS:
29
29
 
30
+ Using the command line tool +rmmseg+ is simple:
30
31
  $ rmmseg --separator _ < input.txt
32
+ Passing the +-h+ option prints an overview of all supported options.
33
+
34
+ Using the +Analyzer+ for Ferret is even easier:
35
+
36
+ require 'rmmseg'
37
+ require 'rmmseg/ferret'
38
+
39
+ analyzer = RMMSeg::Ferret::Analyzer.new
40
+ index = Ferret::Index::Index.new(:analyzer => analyzer)
41
+
42
+ For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
31
43
 
32
44
  == REQUIREMENTS:
33
45
 
data/Rakefile CHANGED
@@ -23,7 +23,7 @@ Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
23
23
  end
24
24
 
25
25
  task :homepage do
26
- sh "gerbil html misc/homepage.erb > misc/homepage.html"
26
+ sh "cd misc && gerbil html homepage.erb > homepage.html"
27
27
  end
28
28
 
29
29
  task :publish_homepage do
data/TODO.txt CHANGED
@@ -1,3 +1,2 @@
1
1
  === TODO
2
2
 
3
- * Add filter to filter out Chinese punctuations.
data/bin/rmmseg CHANGED
@@ -1,7 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- $: << File.join(File.dirname(__FILE__), "..", "lib")
4
-
5
3
  require 'rmmseg'
6
4
  include RMMSeg
7
5
 
File without changes
@@ -0,0 +1,79 @@
1
+
2
+ ×
3
+ π
4
+
5
+
6
+
7
+
8
+
9
+ °
10
+
11
+
12
+
13
+
14
+
15
+ ±
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+ ——
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+ ·
49
+
50
+
51
+
52
+ §
53
+
54
+
55
+
56
+
57
+
58
+
59
+ ÷
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+  
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
File without changes
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.0.1'
9
+ VERSION = '0.1.0'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
data/lib/rmmseg/config.rb CHANGED
@@ -6,8 +6,9 @@ module RMMSeg
6
6
  class Config
7
7
  @algorithm = :complex
8
8
  @on_ambiguity = :select_first
9
- @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
10
- [File.join(File.dirname(__FILE__), "words.dic"), false]]
9
+ data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
10
+ @dictionaries = [[File.join(data_dir, "chars.dic"), true],
11
+ [File.join(data_dir, "words.dic"), false]]
11
12
  @max_word_length = 4
12
13
 
13
14
  class << self
data/lib/rmmseg/ferret.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # This file integrate RMMSeg with Ferret
2
+ require 'singleton'
2
3
  require 'rubygems'
3
4
  require 'ferret'
4
5
 
@@ -6,8 +7,25 @@ module RMMSeg
6
7
  module Ferret
7
8
  # The Analyzer class can be used with Ferret .
8
9
  class Analyzer < ::Ferret::Analysis::Analyzer
10
+
11
+ # Construct an Analyzer. Optional block can be used to
12
+ # add more +TokenFilter+s. e.g.
13
+ #
14
+ # analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
15
+ # Ferret::Analysis::LowerCaseFilter.new(tokenizer)
16
+ # }
17
+ #
18
+ def initialize(&brk)
19
+ @brk = brk
20
+ end
21
+
9
22
  def token_stream(field, text)
10
- Tokenizer.new(text)
23
+ t = PunctuationFilter.new(Tokenizer.new(text))
24
+ if @brk
25
+ @brk.call(t)
26
+ else
27
+ t
28
+ end
11
29
  end
12
30
  end
13
31
 
@@ -39,5 +57,58 @@ module RMMSeg
39
57
  @algor = RMMSeg::Config.algorithm_instance(@text)
40
58
  end
41
59
  end
60
+
61
+ # PunctuationFilter filters out the stand-alone Chinese
62
+ # punctuation tokens.
63
+ class PunctuationFilter < ::Ferret::Analysis::TokenStream
64
+ # The punctuation dictionary.
65
+ class Dictionary
66
+ include Singleton
67
+
68
+ DIC_FILE = File.join(File.dirname(__FILE__),
69
+ "..",
70
+ "..",
71
+ "data",
72
+ "punctuation.dic")
73
+ def initialize
74
+ @dic = Hash.new
75
+ File.open(DIC_FILE, "r") do |f|
76
+ f.each_line { |line|
77
+ @dic[line.chomp.freeze] = nil
78
+ }
79
+ end
80
+ end
81
+
82
+ def include?(str)
83
+ @dic.has_key?(str)
84
+ end
85
+ end
86
+
87
+ def initialize(stream)
88
+ @stream = stream
89
+ end
90
+
91
+ # Get next token, skip stand alone Chinese punctuations.
92
+ def next
93
+ token = nil
94
+ dic = Dictionary.instance
95
+ loop do
96
+ token = @stream.next
97
+ break if token.nil?
98
+
99
+ break unless dic.include? token.text
100
+ end
101
+
102
+ token
103
+ end
104
+
105
+ def text
106
+ @stream.text
107
+ end
108
+
109
+ def text=(str)
110
+ @stream.text = str
111
+ end
112
+ end
42
113
  end
43
114
  end
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'rmmseg/ferret'
5
+
6
+ analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
7
+ Ferret::Analysis::LowerCaseFilter.new(tokenizer)
8
+ }
9
+ $index = Ferret::Index::Index.new(:analyzer => analyzer,
10
+ :path => '/tmp/index')
11
+
12
+ $index << {
13
+ :title => "分词",
14
+ :content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
15
+ }
16
+ $index << {
17
+ :title => "RMMSeg",
18
+ :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
19
+ }
20
+ $index << {
21
+ :title => "Ruby 1.9",
22
+ :content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
23
+ }
24
+ $index << {
25
+ :title => "Ferret",
26
+ :content => <<END
27
+ Ferret is a high-performance, full-featured text search engine library
28
+ written for Ruby. It is inspired by Apache Lucene Java project. With
29
+ the introduction of Ferret, Ruby users now have one of the fastest and
30
+ most flexible search libraries available. And it is surprisingly easy
31
+ to use.
32
+ END
33
+ }
34
+
35
+ def highlight_search(key)
36
+ $index.search_each(%Q!content:"#{key}"!) do |id, score|
37
+ puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
38
+ puts "-"*40
39
+ highlights = $index.highlight("content:#{key}", id,
40
+ :field => :content,
41
+ :pre_tag => "\033[36m",
42
+ :post_tag => "\033[m")
43
+ puts "#{highlights}"
44
+ puts ""
45
+ end
46
+ end
47
+
48
+ ARGV.each { |key|
49
+ puts "\033[33mSearching for #{key}...\033[m"
50
+ puts ""
51
+ highlight_search(key)
52
+ }
53
+
54
+ # Local Variables:
55
+ # coding: utf-8
56
+ # End:
data/misc/homepage.erb CHANGED
@@ -1,3 +1,4 @@
1
+ <%# -*- mode: text; coding: utf-8 -*- %>
1
2
  <%
2
3
  $title = "RMMSeg Homepage"
3
4
  $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
@@ -22,7 +23,7 @@
22
23
  * http://technology.chtsai.org/mmseg/
23
24
  * http://pluskid.lifegoo.com/?p=261
24
25
 
25
- RMMSeg can be used as either a stand alone program or an Analyser of
26
+ RMMSeg can be used as either a stand alone program or an Analyzer of
26
27
  "Ferret":http://ferret.davebalmain.com/trac.
27
28
 
28
29
  <% end %>
@@ -46,7 +47,7 @@
46
47
 
47
48
  sudo gem install rmmseg
48
49
 
49
- Or you can download the gem file manually from RubyForge and install it locally:
50
+ Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
50
51
 
51
52
  sudo gem install --local rmmseg-x.y.z.gem
52
53
 
@@ -77,15 +78,94 @@
77
78
 
78
79
  rmmseg -h
79
80
 
80
- It reads from STDIN and print result to STDOUT.
81
+ It reads from STDIN and prints the result to STDOUT. Here is a real
82
+ example:
83
+
84
+ $ echo "我们都喜欢用 Ruby" | rmmseg
85
+ 我们 都 喜欢 用 Ruby
86
+
87
+ <% end %>
88
+
89
+ <% section "Analyzer for Ferret" do %>
90
+ RMMSeg includes an analyzer for Ferret. It is ready to
91
+ use. Just require it and pass it to Ferret. Here's a complete
92
+ example:
93
+
94
+ <code lang="ruby">
95
+ <%# include ferret_example.rb %>
96
+ </code>
97
+
98
+ execute it on the following key words:
99
+
100
+ $ ruby ferret_example.rb Ruby 中文
101
+
102
+ will generate the following results:
103
+
104
+ <code lang="text">
105
+ Searching for Ruby...
106
+
107
+ *** Document "RMMSeg" found with a score of 0.21875
108
+ ----------------------------------------
109
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
110
+
111
+ *** Document "Ruby 1.9" found with a score of 0.21875
112
+ ----------------------------------------
113
+ Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
114
+
115
+ *** Document "Ferret" found with a score of 0.176776692271233
116
+ ----------------------------------------
117
+ Ferret is a high-performance, full-featured text search engine library
118
+ written for Ruby. It is inspired by Apache Lucene Java project. With
119
+ the introduction of Ferret, Ruby users now have one of the fastest and
120
+ most flexible search libraries available. And it's surprisingly easy
121
+ to use.
122
+
123
+ Searching for 中文...
124
+
125
+ *** Document "分词" found with a score of 0.281680464744568
126
+ ----------------------------------------
127
+ 中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
128
+
129
+ *** Document "RMMSeg" found with a score of 0.281680464744568
130
+ ----------------------------------------
131
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
132
+ </code>
133
+
134
+ And if you run the example in terminal, you'll see the result
135
+ highlighted as in <%= xref "Ferret Example Screenshot" %>.
136
+
137
+ <% figure "Ferret Example Screenshot" do %>
138
+ !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
139
+ <% end %>
140
+
141
+ <% end %>
142
+
143
+ <% section "Customization" do %>
144
+ RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
145
+
146
+ <code lang="ruby">
147
+ RMMSeg::Config.dictionaries = [["dict1.dic", true], # with frequency info
148
+ ["dict2.dic", false], # without
149
+ ["dict3.dic", false]]
150
+ RMMSeg::Config.max_word_length = 6
151
+ </code>
152
+
153
+ Or to use the simple algorithm for more efficient (and less accurate) segmenting:
154
+
155
+ <code>
156
+ RMMSeg::Config.algorithm = :simple
157
+ </code>
158
+
159
+ For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
81
160
  <% end %>
82
161
 
83
162
  <% end %>
84
163
 
85
164
  <% chapter "Resources" do %>
86
- * "Project Home":http://rmmseg.rubyforge.org/: The Project page at RubyForge.
87
- * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg.
88
- * "Ferret Homepage":http://ferret.davebalmain.com/trac: The homepage of Ferret project.
165
+ * "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
166
+ * "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
167
+ * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
168
+ * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
89
169
  <% end %>
90
170
 
91
171
  <% footer do %>
data/misc/homepage.html CHANGED
@@ -2,7 +2,7 @@
2
2
  <html>
3
3
  <head>
4
4
  <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
5
- <meta name="date" content="31 January 2008"/>
5
+ <meta name="date" content="01 February 2008"/>
6
6
  <meta name="author" content="pluskid"/>
7
7
  <meta name="generator" content="Gerbil 1.1.0"/>
8
8
  <title>RMMSeg Homepage</title>
@@ -763,19 +763,19 @@
763
763
 
764
764
  <h1 class="title">RMMSeg Homepage</h1>
765
765
  <h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
766
- <h3 class="date">31 January 2008</h3>
766
+ <h3 class="date">01 February 2008</h3>
767
767
  </div>
768
768
 
769
769
 
770
- <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606563428" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606565568" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606567068" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606569178" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606570778" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606572868" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606577658" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606579198" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606581648" href="#Resources">Resources</a></li></ul></div>
770
+ <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606801458" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3&nbsp;&nbsp;<a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
771
771
 
772
- <div id="lof"><h1>Notes</h1> <ol><li><a id="a-606574508" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
772
+ <div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
773
773
 
774
774
  <div id="content">
775
775
  <div class="chapter">
776
776
  <h1 class="title">
777
777
  Chapter
778
- <a class="toc" id="Introduction" href="#a-606563428">1</a>
778
+ <a class="toc" id="Introduction" href="#a-606801458">1</a>
779
779
 
780
780
  <br/>
781
781
 
@@ -805,13 +805,13 @@ following essays:</p>
805
805
  </ul>
806
806
 
807
807
 
808
- <p>RMMSeg can be used as either a stand alone program or an Analyser of
808
+ <p>RMMSeg can be used as either a stand alone program or an Analyzer of
809
809
  <a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
810
810
  </div>
811
811
  <div class="chapter">
812
812
  <h1 class="title">
813
813
  Chapter
814
- <a class="toc" id="Setup" href="#a-606565568">2</a>
814
+ <a class="toc" id="Setup" href="#a-606803598">2</a>
815
815
 
816
816
  <br/>
817
817
 
@@ -820,7 +820,7 @@ following essays:</p>
820
820
 
821
821
  <div class="content"><div class="section">
822
822
  <h2 class="title">
823
- <a class="toc" id="Requirements" href="#a-606567068">2.1</a>&nbsp;&nbsp;Requirements
823
+ <a class="toc" id="Requirements" href="#a-606805098">2.1</a>&nbsp;&nbsp;Requirements
824
824
  </h2>
825
825
  <div class="content">Your system needs the following software to run RMMSeg.
826
826
 
@@ -850,11 +850,11 @@ following essays:</p>
850
850
  </div>
851
851
  <div class="section">
852
852
  <h2 class="title">
853
- <a class="toc" id="Installation" href="#a-606569178">2.2</a>&nbsp;&nbsp;Installation
853
+ <a class="toc" id="Installation" href="#a-606807208">2.2</a>&nbsp;&nbsp;Installation
854
854
  </h2>
855
855
  <div class="content"><div class="section">
856
856
  <h3 class="title">
857
- <a class="toc" id="Using-RubyGems" href="#a-606570778">2.2.1</a>&nbsp;&nbsp;Using RubyGems
857
+ <a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a>&nbsp;&nbsp;Using RubyGems
858
858
  </h3>
859
859
  <div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
860
860
 
@@ -862,18 +862,18 @@ following essays:</p>
862
862
  <pre>sudo gem install rmmseg</pre>
863
863
 
864
864
 
865
- <p>Or you can download the gem file manually from RubyForge and install it locally:</p>
865
+ <p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
866
866
 
867
867
 
868
868
  <pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
869
869
  </div>
870
870
  <div class="section">
871
871
  <h3 class="title">
872
- <a class="toc" id="From-Subversion" href="#a-606572868">2.2.2</a>&nbsp;&nbsp;From Subversion
872
+ <a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a>&nbsp;&nbsp;From Subversion
873
873
  </h3>
874
874
  <div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
875
875
  <div class="note">
876
- <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606574508">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
876
+ <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
877
877
 
878
878
  <img src="
879
879
  fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
954
954
  <div class="chapter">
955
955
  <h1 class="title">
956
956
  Chapter
957
- <a class="toc" id="Usage" href="#a-606577658">3</a>
957
+ <a class="toc" id="Usage" href="#a-606815688">3</a>
958
958
 
959
959
  <br/>
960
960
 
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
963
963
 
964
964
  <div class="content"><div class="section">
965
965
  <h2 class="title">
966
- <a class="toc" id="Stand-Alone-rmmseg" href="#a-606579198">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
966
+ <a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
967
967
  </h2>
968
968
  <div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
969
969
 
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
971
971
  <pre>rmmseg -h</pre>
972
972
 
973
973
 
974
- <p>It reads from STDIN and print result to STDOUT.</p></div>
974
+ <p>It reads from STDIN and prints the result to STDOUT. Here is a real
975
+ example:</p>
976
+
977
+
978
+ <pre>$ echo "我们都喜欢用 Ruby" | rmmseg
979
+ 我们 都 喜欢 用 Ruby</pre></div>
980
+ </div>
981
+ <div class="section">
982
+ <h2 class="title">
983
+ <a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a>&nbsp;&nbsp;Analyzer for Ferret
984
+ </h2>
985
+ <div class="content"><p>RMMSeg includes an analyzer for Ferret. It is ready to
986
+ use. Just require it and pass it to Ferret. Here&#8217;s a complete
987
+ example:</p>
988
+
989
+
990
+ <pre class="code" lang="ruby">
991
+ <span style="color:#888">#!/usr/bin/env ruby</span>
992
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
993
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
994
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
995
+
996
+ analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
997
+ <span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
998
+
999
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1000
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">分词</span><span style="color:#710">&quot;</span></span>,
1001
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">&quot;</span></span>
1002
+ }
1003
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1004
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg</span><span style="color:#710">&quot;</span></span>,
1005
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。</span><span style="color:#710">&quot;</span></span>
1006
+ }
1007
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1008
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">&quot;</span></span>,
1009
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">&quot;</span></span>
1010
+ }
1011
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1012
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ferret</span><span style="color:#710">&quot;</span></span>,
1013
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&lt;&lt;END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
1014
+ Ferret is a high-performance, full-featured text search engine library
1015
+ written for Ruby. It is inspired by Apache Lucene Java project. With
1016
+ the introduction of Ferret, Ruby users now have one of the fastest and
1017
+ most flexible search libraries available. And it is surprisingly easy
1018
+ to use.</span><span style="color:#710">
1019
+ END</span></span>
1020
+ }
1021
+
1022
+ <span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
1023
+ <span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">&quot;</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
1024
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">*** Document </span><span style="color:#04D">\&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\&quot;</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
1025
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">-</span><span style="color:#710">&quot;</span></span>*<span style="color:#00D; font-weight:bold">40</span>
1026
+ highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>, id,
1027
+ <span style="color:#A60">:field</span> =&gt; <span style="color:#A60">:content</span>,
1028
+ <span style="color:#A60">:pre_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">&quot;</span></span>,
1029
+ <span style="color:#A60">:post_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>)
1030
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
1031
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
1032
+ <span style="color:#080; font-weight:bold">end</span>
1033
+ <span style="color:#080; font-weight:bold">end</span>
1034
+
1035
+ <span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
1036
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>
1037
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
1038
+ highlight_search(key)
1039
+ }
1040
+
1041
+ <span style="color:#888"># Local Variables:</span>
1042
+ <span style="color:#888"># coding: utf-8</span>
1043
+ <span style="color:#888"># End:</span>
1044
+ </pre>
1045
+
1046
+
1047
+ <p>execute it on the following key words:</p>
1048
+
1049
+
1050
+ <pre>$ ruby ferret_example.rb Ruby 中文</pre>
1051
+
1052
+
1053
+ <p>will generate the following results:</p>
1054
+
1055
+
1056
+ <pre class="code" lang="text">
1057
+ Searching for Ruby...
1058
+
1059
+ *** Document &quot;RMMSeg&quot; found with a score of 0.21875
1060
+ ----------------------------------------
1061
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
1062
+
1063
+ *** Document &quot;Ruby 1.9&quot; found with a score of 0.21875
1064
+ ----------------------------------------
1065
+ Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
1066
+
1067
+ *** Document &quot;Ferret&quot; found with a score of 0.176776692271233
1068
+ ----------------------------------------
1069
+ Ferret is a high-performance, full-featured text search engine library
1070
+ written for Ruby. It is inspired by Apache Lucene Java project. With
1071
+ the introduction of Ferret, Ruby users now have one of the fastest and
1072
+ most flexible search libraries available. And it's surprisingly easy
1073
+ to use.
1074
+
1075
+ Searching for 中文...
1076
+
1077
+ *** Document &quot;分词&quot; found with a score of 0.281680464744568
1078
+ ----------------------------------------
1079
+ 中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
1080
+
1081
+ *** Document &quot;RMMSeg&quot; found with a score of 0.281680464744568
1082
+ ----------------------------------------
1083
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
1084
+ </pre>
1085
+
1086
+
1087
+ <p>And if you run the example in terminal, you&#8217;ll see the result
1088
+ highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
1089
+
1090
+
1091
+ <p><div class="figure">
1092
+ <p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>.&nbsp;&nbsp;Ferret Example Screenshot</p>
1093
+ <div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
1094
+ </div></p></div>
1095
+ </div>
1096
+ <div class="section">
1097
+ <h2 class="title">
1098
+ <a class="toc" id="Customization" href="#a-606825488">3.3</a>&nbsp;&nbsp;Customization
1099
+ </h2>
1100
+ <div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
1101
+
1102
+
1103
+ <pre class="code" lang="ruby">
1104
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict1.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">true</span>], <span style="color:#888"># with frequency info</span>
1105
+ [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict2.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
1106
+ [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict3.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
1107
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
1108
+ </pre>
1109
+
1110
+
1111
+ <p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
1112
+
1113
+
1114
+ <pre class="code">
1115
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
1116
+ </pre>
1117
+
1118
+
1119
+ <p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
975
1120
  </div></div>
976
1121
  </div>
977
1122
  <div class="chapter">
978
1123
  <h1 class="title">
979
1124
  Chapter
980
- <a class="toc" id="Resources" href="#a-606581648">4</a>
1125
+ <a class="toc" id="Resources" href="#a-606828108">4</a>
981
1126
 
982
1127
  <br/>
983
1128
 
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
985
1130
  </h1>
986
1131
 
987
1132
  <div class="content"><ul>
988
- <li><a href="http://rmmseg.rubyforge.org/">Project Home</a>: The Project page at RubyForge.</li>
989
- <li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg.</li>
990
- <li><a href="http://ferret.davebalmain.com/trac">Ferret Homepage</a>: The homepage of Ferret project.</li>
1133
+ <li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
1134
+ <li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
1135
+ <li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
1136
+ <li><a href="mailto:pluskid@gmail.com">Author&#8217;s Email</a>: Contact me if you have any problem.</li>
991
1137
  </ul></div>
992
1138
  </div>
993
1139
  </div>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -39,10 +39,12 @@ files:
39
39
  - Rakefile
40
40
  - TODO.txt
41
41
  - bin/rmmseg
42
+ - data/chars.dic
43
+ - data/punctuation.dic
44
+ - data/words.dic
42
45
  - lib/rmmseg.rb
43
46
  - lib/rmmseg/algorithm.rb
44
47
  - lib/rmmseg/amibguity.rb
45
- - lib/rmmseg/chars.dic
46
48
  - lib/rmmseg/chunk.rb
47
49
  - lib/rmmseg/complex_algorithm.rb
48
50
  - lib/rmmseg/config.rb
@@ -56,7 +58,7 @@ files:
56
58
  - lib/rmmseg/svwl_rule.rb
57
59
  - lib/rmmseg/token.rb
58
60
  - lib/rmmseg/word.rb
59
- - lib/rmmseg/words.dic
61
+ - misc/ferret_example.rb
60
62
  - misc/homepage.erb
61
63
  - misc/homepage.html
62
64
  - spec/chunk_spec.rb