rmmseg 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,6 +1,11 @@
1
+ === 0.1.0 / 2008-02-01
2
+
3
+ * Add filter to filter out Chinese punctuations.
4
+
5
+
1
6
  === 0.0.1 / 2008-01-31
2
7
 
3
- * Analyser integration with Ferret.
8
+ * Analyzer integration with Ferret.
4
9
  * rdoc added
5
10
  * Lazily init the +Word+ objects inside the +Dictionary+.
6
11
  * Handle English punctuation correctly.
data/Manifest.txt CHANGED
@@ -4,10 +4,12 @@ README.txt
4
4
  Rakefile
5
5
  TODO.txt
6
6
  bin/rmmseg
7
+ data/chars.dic
8
+ data/punctuation.dic
9
+ data/words.dic
7
10
  lib/rmmseg.rb
8
11
  lib/rmmseg/algorithm.rb
9
12
  lib/rmmseg/amibguity.rb
10
- lib/rmmseg/chars.dic
11
13
  lib/rmmseg/chunk.rb
12
14
  lib/rmmseg/complex_algorithm.rb
13
15
  lib/rmmseg/config.rb
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
21
23
  lib/rmmseg/svwl_rule.rb
22
24
  lib/rmmseg/token.rb
23
25
  lib/rmmseg/word.rb
24
- lib/rmmseg/words.dic
26
+ misc/ferret_example.rb
25
27
  misc/homepage.erb
26
28
  misc/homepage.html
27
29
  spec/chunk_spec.rb
data/README.txt CHANGED
@@ -23,11 +23,23 @@ following essays:
23
23
 
24
24
  * Provides +rmmseg+ command line tool for quick and easy way to access
25
25
  the word segment feature.
26
- * Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
26
+ * Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
27
27
 
28
28
  == SYNOPSIS:
29
29
 
30
+ Using the command line tool +rmmseg+ is simple:
30
31
  $ rmmseg --separator _ < input.txt
32
+ Passing the +-h+ option prints an overview of all supported options.
33
+
34
+ Using the +Analyzer+ for Ferret is even easier:
35
+
36
+ require 'rmmseg'
37
+ require 'rmmseg/ferret'
38
+
39
+ analyzer = RMMSeg::Ferret::Analyzer.new
40
+ index = Ferret::Index::Index.new(:analyzer => analyzer)
41
+
42
+ For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
31
43
 
32
44
  == REQUIREMENTS:
33
45
 
data/Rakefile CHANGED
@@ -23,7 +23,7 @@ Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
23
23
  end
24
24
 
25
25
  task :homepage do
26
- sh "gerbil html misc/homepage.erb > misc/homepage.html"
26
+ sh "cd misc && gerbil html homepage.erb > homepage.html"
27
27
  end
28
28
 
29
29
  task :publish_homepage do
data/TODO.txt CHANGED
@@ -1,3 +1,2 @@
1
1
  === TODO
2
2
 
3
- * Add filter to filter out Chinese punctuations.
data/bin/rmmseg CHANGED
@@ -1,7 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- $: << File.join(File.dirname(__FILE__), "..", "lib")
4
-
5
3
  require 'rmmseg'
6
4
  include RMMSeg
7
5
 
File without changes
@@ -0,0 +1,79 @@
1
+
2
+ ×
3
+ π
4
+
5
+
6
+
7
+
8
+
9
+ °
10
+
11
+
12
+
13
+
14
+
15
+ ±
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+ ——
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+ ·
49
+
50
+
51
+
52
+ §
53
+
54
+
55
+
56
+
57
+
58
+
59
+ ÷
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+  
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
File without changes
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.0.1'
9
+ VERSION = '0.1.0'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
data/lib/rmmseg/config.rb CHANGED
@@ -6,8 +6,9 @@ module RMMSeg
6
6
  class Config
7
7
  @algorithm = :complex
8
8
  @on_ambiguity = :select_first
9
- @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
10
- [File.join(File.dirname(__FILE__), "words.dic"), false]]
9
+ data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
10
+ @dictionaries = [[File.join(data_dir, "chars.dic"), true],
11
+ [File.join(data_dir, "words.dic"), false]]
11
12
  @max_word_length = 4
12
13
 
13
14
  class << self
data/lib/rmmseg/ferret.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # This file integrates RMMSeg with Ferret
2
+ require 'singleton'
2
3
  require 'rubygems'
3
4
  require 'ferret'
4
5
 
@@ -6,8 +7,25 @@ module RMMSeg
6
7
  module Ferret
7
8
  # The Analyzer class can be used with Ferret.
8
9
  class Analyzer < ::Ferret::Analysis::Analyzer
10
+
11
+ # Construct an Analyzer. Optional block can be used to
12
+ # add more +TokenFilter+s. e.g.
13
+ #
14
+ # analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
15
+ # Ferret::Analysis::LowerCaseFilter.new(tokenizer)
16
+ # }
17
+ #
18
+ def initialize(&brk)
19
+ @brk = brk
20
+ end
21
+
9
22
  def token_stream(field, text)
10
- Tokenizer.new(text)
23
+ t = PunctuationFilter.new(Tokenizer.new(text))
24
+ if @brk
25
+ @brk.call(t)
26
+ else
27
+ t
28
+ end
11
29
  end
12
30
  end
13
31
 
@@ -39,5 +57,58 @@ module RMMSeg
39
57
  @algor = RMMSeg::Config.algorithm_instance(@text)
40
58
  end
41
59
  end
60
+
61
+ # PunctuationFilter filters out the stand-alone Chinese
62
+ # punctuation tokens.
63
+ class PunctuationFilter < ::Ferret::Analysis::TokenStream
64
+ # The punctuation dictionary.
65
+ class Dictionary
66
+ include Singleton
67
+
68
+ DIC_FILE = File.join(File.dirname(__FILE__),
69
+ "..",
70
+ "..",
71
+ "data",
72
+ "punctuation.dic")
73
+ def initialize
74
+ @dic = Hash.new
75
+ File.open(DIC_FILE, "r") do |f|
76
+ f.each_line { |line|
77
+ @dic[line.chomp.freeze] = nil
78
+ }
79
+ end
80
+ end
81
+
82
+ def include?(str)
83
+ @dic.has_key?(str)
84
+ end
85
+ end
86
+
87
+ def initialize(stream)
88
+ @stream = stream
89
+ end
90
+
91
+ # Get the next token, skipping stand-alone Chinese punctuation.
92
+ def next
93
+ token = nil
94
+ dic = Dictionary.instance
95
+ loop do
96
+ token = @stream.next
97
+ break if token.nil?
98
+
99
+ break unless dic.include? token.text
100
+ end
101
+
102
+ token
103
+ end
104
+
105
+ def text
106
+ @stream.text
107
+ end
108
+
109
+ def text=(str)
110
+ @stream.text = str
111
+ end
112
+ end
42
113
  end
43
114
  end
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'rmmseg/ferret'
5
+
6
+ analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
7
+ Ferret::Analysis::LowerCaseFilter.new(tokenizer)
8
+ }
9
+ $index = Ferret::Index::Index.new(:analyzer => analyzer,
10
+ :path => '/tmp/index')
11
+
12
+ $index << {
13
+ :title => "分词",
14
+ :content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
15
+ }
16
+ $index << {
17
+ :title => "RMMSeg",
18
+ :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
19
+ }
20
+ $index << {
21
+ :title => "Ruby 1.9",
22
+ :content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
23
+ }
24
+ $index << {
25
+ :title => "Ferret",
26
+ :content => <<END
27
+ Ferret is a high-performance, full-featured text search engine library
28
+ written for Ruby. It is inspired by Apache Lucene Java project. With
29
+ the introduction of Ferret, Ruby users now have one of the fastest and
30
+ most flexible search libraries available. And it is surprisingly easy
31
+ to use.
32
+ END
33
+ }
34
+
35
+ def highlight_search(key)
36
+ $index.search_each(%Q!content:"#{key}"!) do |id, score|
37
+ puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
38
+ puts "-"*40
39
+ highlights = $index.highlight("content:#{key}", id,
40
+ :field => :content,
41
+ :pre_tag => "\033[36m",
42
+ :post_tag => "\033[m")
43
+ puts "#{highlights}"
44
+ puts ""
45
+ end
46
+ end
47
+
48
+ ARGV.each { |key|
49
+ puts "\033[33mSearching for #{key}...\033[m"
50
+ puts ""
51
+ highlight_search(key)
52
+ }
53
+
54
+ # Local Variables:
55
+ # coding: utf-8
56
+ # End:
data/misc/homepage.erb CHANGED
@@ -1,3 +1,4 @@
1
+ <%# -*- mode: text; coding: utf-8 -*- %>
1
2
  <%
2
3
  $title = "RMMSeg Homepage"
3
4
  $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
@@ -22,7 +23,7 @@
22
23
  * http://technology.chtsai.org/mmseg/
23
24
  * http://pluskid.lifegoo.com/?p=261
24
25
 
25
- RMMSeg can be used as either a stand alone program or an Analyser of
26
+ RMMSeg can be used as either a stand alone program or an Analyzer of
26
27
  "Ferret":http://ferret.davebalmain.com/trac.
27
28
 
28
29
  <% end %>
@@ -46,7 +47,7 @@
46
47
 
47
48
  sudo gem install rmmseg
48
49
 
49
- Or you can download the gem file manually from RubyForge and install it locally:
50
+ Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
50
51
 
51
52
  sudo gem install --local rmmseg-x.y.z.gem
52
53
 
@@ -77,15 +78,94 @@
77
78
 
78
79
  rmmseg -h
79
80
 
80
- It reads from STDIN and print result to STDOUT.
81
+ It reads from STDIN and prints the result to STDOUT. Here is a real
82
+ example:
83
+
84
+ $ echo "我们都喜欢用 Ruby" | rmmseg
85
+ 我们 都 喜欢 用 Ruby
86
+
87
+ <% end %>
88
+
89
+ <% section "Analyzer for Ferret" do %>
90
+ RMMSeg includes an analyzer for Ferret. It is simply ready to
91
+ use. Just require it and pass it to Ferret. Here's a complete
92
+ example:
93
+
94
+ <code lang="ruby">
95
+ <%# include ferret_example.rb %>
96
+ </code>
97
+
98
+ execute it on the following key words:
99
+
100
+ $ ruby ferret_example.rb Ruby 中文
101
+
102
+ will generate the following results:
103
+
104
+ <code lang="text">
105
+ Searching for Ruby...
106
+
107
+ *** Document "RMMSeg" found with a score of 0.21875
108
+ ----------------------------------------
109
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
110
+
111
+ *** Document "Ruby 1.9" found with a score of 0.21875
112
+ ----------------------------------------
113
+ Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
114
+
115
+ *** Document "Ferret" found with a score of 0.176776692271233
116
+ ----------------------------------------
117
+ Ferret is a high-performance, full-featured text search engine library
118
+ written for Ruby. It is inspired by Apache Lucene Java project. With
119
+ the introduction of Ferret, Ruby users now have one of the fastest and
120
+ most flexible search libraries available. And it's surprisingly easy
121
+ to use.
122
+
123
+ Searching for 中文...
124
+
125
+ *** Document "分词" found with a score of 0.281680464744568
126
+ ----------------------------------------
127
+ 中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
128
+
129
+ *** Document "RMMSeg" found with a score of 0.281680464744568
130
+ ----------------------------------------
131
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
132
+ </code>
133
+
134
+ And if you run the example in terminal, you'll see the result
135
+ highlighted as in <%= xref "Ferret Example Screenshot" %>.
136
+
137
+ <% figure "Ferret Example Screenshot" do %>
138
+ !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
139
+ <% end %>
140
+
141
+ <% end %>
142
+
143
+ <% section "Customization" do %>
144
+ RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
145
+
146
+ <code lang="ruby">
147
+ RMMSeg::Config.dictionaries = [["dict1.dic", true], # with frequency info
148
+ ["dict2.dic", false], # without
149
+ ["dict3.dic", false]]
150
+ RMMSeg::Config.max_word_length = 6
151
+ </code>
152
+
153
+ Or to use the simple algorithm for more efficient (and less accurate) segmenting:
154
+
155
+ <code>
156
+ RMMSeg::Config.algorithm = :simple
157
+ </code>
158
+
159
+ For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
81
160
  <% end %>
82
161
 
83
162
  <% end %>
84
163
 
85
164
  <% chapter "Resources" do %>
86
- * "Project Home":http://rmmseg.rubyforge.org/: The Project page at RubyForge.
87
- * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg.
88
- * "Ferret Homepage":http://ferret.davebalmain.com/trac: The homepage of Ferret project.
165
+ * "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
166
+ * "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
167
+ * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
168
+ * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
89
169
  <% end %>
90
170
 
91
171
  <% footer do %>
data/misc/homepage.html CHANGED
@@ -2,7 +2,7 @@
2
2
  <html>
3
3
  <head>
4
4
  <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
5
- <meta name="date" content="31 January 2008"/>
5
+ <meta name="date" content="01 February 2008"/>
6
6
  <meta name="author" content="pluskid"/>
7
7
  <meta name="generator" content="Gerbil 1.1.0"/>
8
8
  <title>RMMSeg Homepage</title>
@@ -763,19 +763,19 @@
763
763
 
764
764
  <h1 class="title">RMMSeg Homepage</h1>
765
765
  <h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
766
- <h3 class="date">31 January 2008</h3>
766
+ <h3 class="date">01 February 2008</h3>
767
767
  </div>
768
768
 
769
769
 
770
- <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606563428" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606565568" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606567068" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606569178" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606570778" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606572868" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606577658" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606579198" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606581648" href="#Resources">Resources</a></li></ul></div>
770
+ <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606801458" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3&nbsp;&nbsp;<a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
771
771
 
772
- <div id="lof"><h1>Notes</h1> <ol><li><a id="a-606574508" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
772
+ <div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
773
773
 
774
774
  <div id="content">
775
775
  <div class="chapter">
776
776
  <h1 class="title">
777
777
  Chapter
778
- <a class="toc" id="Introduction" href="#a-606563428">1</a>
778
+ <a class="toc" id="Introduction" href="#a-606801458">1</a>
779
779
 
780
780
  <br/>
781
781
 
@@ -805,13 +805,13 @@ following essays:</p>
805
805
  </ul>
806
806
 
807
807
 
808
- <p>RMMSeg can be used as either a stand alone program or an Analyser of
808
+ <p>RMMSeg can be used as either a stand alone program or an Analyzer of
809
809
  <a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
810
810
  </div>
811
811
  <div class="chapter">
812
812
  <h1 class="title">
813
813
  Chapter
814
- <a class="toc" id="Setup" href="#a-606565568">2</a>
814
+ <a class="toc" id="Setup" href="#a-606803598">2</a>
815
815
 
816
816
  <br/>
817
817
 
@@ -820,7 +820,7 @@ following essays:</p>
820
820
 
821
821
  <div class="content"><div class="section">
822
822
  <h2 class="title">
823
- <a class="toc" id="Requirements" href="#a-606567068">2.1</a>&nbsp;&nbsp;Requirements
823
+ <a class="toc" id="Requirements" href="#a-606805098">2.1</a>&nbsp;&nbsp;Requirements
824
824
  </h2>
825
825
  <div class="content">Your system needs the following software to run RMMSeg.
826
826
 
@@ -850,11 +850,11 @@ following essays:</p>
850
850
  </div>
851
851
  <div class="section">
852
852
  <h2 class="title">
853
- <a class="toc" id="Installation" href="#a-606569178">2.2</a>&nbsp;&nbsp;Installation
853
+ <a class="toc" id="Installation" href="#a-606807208">2.2</a>&nbsp;&nbsp;Installation
854
854
  </h2>
855
855
  <div class="content"><div class="section">
856
856
  <h3 class="title">
857
- <a class="toc" id="Using-RubyGems" href="#a-606570778">2.2.1</a>&nbsp;&nbsp;Using RubyGems
857
+ <a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a>&nbsp;&nbsp;Using RubyGems
858
858
  </h3>
859
859
  <div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
860
860
 
@@ -862,18 +862,18 @@ following essays:</p>
862
862
  <pre>sudo gem install rmmseg</pre>
863
863
 
864
864
 
865
- <p>Or you can download the gem file manually from RubyForge and install it locally:</p>
865
+ <p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
866
866
 
867
867
 
868
868
  <pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
869
869
  </div>
870
870
  <div class="section">
871
871
  <h3 class="title">
872
- <a class="toc" id="From-Subversion" href="#a-606572868">2.2.2</a>&nbsp;&nbsp;From Subversion
872
+ <a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a>&nbsp;&nbsp;From Subversion
873
873
  </h3>
874
874
  <div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
875
875
  <div class="note">
876
- <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606574508">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
876
+ <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
877
877
 
878
878
  <img src="
879
879
  fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
954
954
  <div class="chapter">
955
955
  <h1 class="title">
956
956
  Chapter
957
- <a class="toc" id="Usage" href="#a-606577658">3</a>
957
+ <a class="toc" id="Usage" href="#a-606815688">3</a>
958
958
 
959
959
  <br/>
960
960
 
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
963
963
 
964
964
  <div class="content"><div class="section">
965
965
  <h2 class="title">
966
- <a class="toc" id="Stand-Alone-rmmseg" href="#a-606579198">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
966
+ <a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
967
967
  </h2>
968
968
  <div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
969
969
 
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
971
971
  <pre>rmmseg -h</pre>
972
972
 
973
973
 
974
- <p>It reads from STDIN and print result to STDOUT.</p></div>
974
+ <p>It reads from STDIN and prints the result to STDOUT. Here is a real
975
+ example:</p>
976
+
977
+
978
+ <pre>$ echo "我们都喜欢用 Ruby" | rmmseg
979
+ 我们 都 喜欢 用 Ruby</pre></div>
980
+ </div>
981
+ <div class="section">
982
+ <h2 class="title">
983
+ <a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a>&nbsp;&nbsp;Analyzer for Ferret
984
+ </h2>
985
+ <div class="content"><p>RMMSeg includes an analyzer for Ferret. It is simply ready to
986
+ use. Just require it and pass it to Ferret. Here&#8217;s a complete
987
+ example:</p>
988
+
989
+
990
+ <pre class="code" lang="ruby">
991
+ <span style="color:#888">#!/usr/bin/env ruby</span>
992
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
993
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
994
+ require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
995
+
996
+ analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
997
+ <span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
998
+
999
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1000
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">分词</span><span style="color:#710">&quot;</span></span>,
1001
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">&quot;</span></span>
1002
+ }
1003
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1004
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg</span><span style="color:#710">&quot;</span></span>,
1005
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。</span><span style="color:#710">&quot;</span></span>
1006
+ }
1007
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1008
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">&quot;</span></span>,
1009
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">&quot;</span></span>
1010
+ }
1011
+ <span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
1012
+ <span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ferret</span><span style="color:#710">&quot;</span></span>,
1013
+ <span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&lt;&lt;END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
1014
+ Ferret is a high-performance, full-featured text search engine library
1015
+ written for Ruby. It is inspired by Apache Lucene Java project. With
1016
+ the introduction of Ferret, Ruby users now have one of the fastest and
1017
+ most flexible search libraries available. And it is surprisingly easy
1018
+ to use.</span><span style="color:#710">
1019
+ END</span></span>
1020
+ }
1021
+
1022
+ <span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
1023
+ <span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">&quot;</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
1024
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">*** Document </span><span style="color:#04D">\&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\&quot;</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
1025
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">-</span><span style="color:#710">&quot;</span></span>*<span style="color:#00D; font-weight:bold">40</span>
1026
+ highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>, id,
1027
+ <span style="color:#A60">:field</span> =&gt; <span style="color:#A60">:content</span>,
1028
+ <span style="color:#A60">:pre_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">&quot;</span></span>,
1029
+ <span style="color:#A60">:post_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>)
1030
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
1031
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
1032
+ <span style="color:#080; font-weight:bold">end</span>
1033
+ <span style="color:#080; font-weight:bold">end</span>
1034
+
1035
+ <span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
1036
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>
1037
+ puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
1038
+ highlight_search(key)
1039
+ }
1040
+
1041
+ <span style="color:#888"># Local Variables:</span>
1042
+ <span style="color:#888"># coding: utf-8</span>
1043
+ <span style="color:#888"># End:</span>
1044
+ </pre>
1045
+
1046
+
1047
+ <p>execute it on the following key words:</p>
1048
+
1049
+
1050
+ <pre>$ ruby ferret_example.rb Ruby 中文</pre>
1051
+
1052
+
1053
+ <p>will generate the following results:</p>
1054
+
1055
+
1056
+ <pre class="code" lang="text">
1057
+ Searching for Ruby...
1058
+
1059
+ *** Document &quot;RMMSeg&quot; found with a score of 0.21875
1060
+ ----------------------------------------
1061
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
1062
+
1063
+ *** Document &quot;Ruby 1.9&quot; found with a score of 0.21875
1064
+ ----------------------------------------
1065
+ Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
1066
+
1067
+ *** Document &quot;Ferret&quot; found with a score of 0.176776692271233
1068
+ ----------------------------------------
1069
+ Ferret is a high-performance, full-featured text search engine library
1070
+ written for Ruby. It is inspired by Apache Lucene Java project. With
1071
+ the introduction of Ferret, Ruby users now have one of the fastest and
1072
+ most flexible search libraries available. And it's surprisingly easy
1073
+ to use.
1074
+
1075
+ Searching for 中文...
1076
+
1077
+ *** Document &quot;分词&quot; found with a score of 0.281680464744568
1078
+ ----------------------------------------
1079
+ 中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
1080
+
1081
+ *** Document &quot;RMMSeg&quot; found with a score of 0.281680464744568
1082
+ ----------------------------------------
1083
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
1084
+ </pre>
1085
+
1086
+
1087
+ <p>And if you run the example in terminal, you&#8217;ll see the result
1088
+ highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
1089
+
1090
+
1091
+ <p><div class="figure">
1092
+ <p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>.&nbsp;&nbsp;Ferret Example Screenshot</p>
1093
+ <div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
1094
+ </div></p></div>
1095
+ </div>
1096
+ <div class="section">
1097
+ <h2 class="title">
1098
+ <a class="toc" id="Customization" href="#a-606825488">3.3</a>&nbsp;&nbsp;Customization
1099
+ </h2>
1100
+ <div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
1101
+
1102
+
1103
+ <pre class="code" lang="ruby">
1104
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict1.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">true</span>], <span style="color:#888"># with frequency info</span>
1105
+ [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict2.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
1106
+ [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict3.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
1107
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
1108
+ </pre>
1109
+
1110
+
1111
+ <p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
1112
+
1113
+
1114
+ <pre class="code">
1115
+ <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
1116
+ </pre>
1117
+
1118
+
1119
+ <p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
975
1120
  </div></div>
976
1121
  </div>
977
1122
  <div class="chapter">
978
1123
  <h1 class="title">
979
1124
  Chapter
980
- <a class="toc" id="Resources" href="#a-606581648">4</a>
1125
+ <a class="toc" id="Resources" href="#a-606828108">4</a>
981
1126
 
982
1127
  <br/>
983
1128
 
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
985
1130
  </h1>
986
1131
 
987
1132
  <div class="content"><ul>
988
- <li><a href="http://rmmseg.rubyforge.org/">Project Home</a>: The Project page at RubyForge.</li>
989
- <li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg.</li>
990
- <li><a href="http://ferret.davebalmain.com/trac">Ferret Homepage</a>: The homepage of Ferret project.</li>
1133
+ <li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
1134
+ <li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
1135
+ <li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
1136
+ <li><a href="mailto:pluskid@gmail.com">Author&#8217;s Email</a>: Contact me if you have any problem.</li>
991
1137
  </ul></div>
992
1138
  </div>
993
1139
  </div>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -39,10 +39,12 @@ files:
39
39
  - Rakefile
40
40
  - TODO.txt
41
41
  - bin/rmmseg
42
+ - data/chars.dic
43
+ - data/punctuation.dic
44
+ - data/words.dic
42
45
  - lib/rmmseg.rb
43
46
  - lib/rmmseg/algorithm.rb
44
47
  - lib/rmmseg/amibguity.rb
45
- - lib/rmmseg/chars.dic
46
48
  - lib/rmmseg/chunk.rb
47
49
  - lib/rmmseg/complex_algorithm.rb
48
50
  - lib/rmmseg/config.rb
@@ -56,7 +58,7 @@ files:
56
58
  - lib/rmmseg/svwl_rule.rb
57
59
  - lib/rmmseg/token.rb
58
60
  - lib/rmmseg/word.rb
59
- - lib/rmmseg/words.dic
61
+ - misc/ferret_example.rb
60
62
  - misc/homepage.erb
61
63
  - misc/homepage.html
62
64
  - spec/chunk_spec.rb