RubyGems - rmmseg - Versions diffs - 0.0.1 → 0.1.0 - Mend

rmmseg 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/History.txt +6 -1
data/Manifest.txt +4 -2
data/README.txt +13 -1
data/Rakefile +1 -1
data/TODO.txt +0 -1
data/bin/rmmseg +0 -2
data/{lib/rmmseg → data}/chars.dic +0 -0
data/data/punctuation.dic +79 -0
data/{lib/rmmseg → data}/words.dic +0 -0
data/lib/rmmseg.rb +1 -1
data/lib/rmmseg/config.rb +3 -2
data/lib/rmmseg/ferret.rb +72 -1
data/misc/ferret_example.rb +56 -0
data/misc/homepage.erb +86 -6
data/misc/homepage.html +166 -20
metadata +5 -3

data/History.txt CHANGED Viewed

@@ -1,6 +1,11 @@
+=== 0.1.0 / 2008-02-01
+* Add filter to filter out Chinese punctuations.
 === 0.0.1 / 2008-01-31
-* Analyser integration with Ferret.
+* Analyzer integration with Ferret.
 * rdoc added
 * Lazily init the +Word+ objects inside the +Dictionary+.
 * Handle English punctuation correctly.

data/Manifest.txt CHANGED Viewed

@@ -4,10 +4,12 @@ README.txt
 Rakefile
 TODO.txt
 bin/rmmseg
+data/chars.dic
+data/punctuation.dic
+data/words.dic
 lib/rmmseg.rb
 lib/rmmseg/algorithm.rb
 lib/rmmseg/amibguity.rb
-lib/rmmseg/chars.dic
 lib/rmmseg/chunk.rb
 lib/rmmseg/complex_algorithm.rb
 lib/rmmseg/config.rb
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
 lib/rmmseg/svwl_rule.rb
 lib/rmmseg/token.rb
 lib/rmmseg/word.rb
-lib/rmmseg/words.dic
+misc/ferret_example.rb
 misc/homepage.erb
 misc/homepage.html
 spec/chunk_spec.rb

data/README.txt CHANGED Viewed

@@ -23,11 +23,23 @@ following essays:
 * Provides +rmmseg+ command line tool for quick and easy way to access
   the word segment feature.
-* Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
+* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
 == SYNOPSIS:
+Using the command line tool +rmmseg+ is simple:
   $ rmmseg --separator _ < input.txt
+Passing the +-h+ option prints an overview of all supported options.
+Using the +Analyzer+ for Ferret is even easier:
+  require 'rmmseg'
+  require 'rmmseg/ferret'
+  analyzer = RMMSeg::Ferret::Analyzer.new
+  index = Ferret::Index::Index.new(:analyzer => analyzer)
+For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
 == REQUIREMENTS:

data/Rakefile CHANGED Viewed

@@ -23,7 +23,7 @@ Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
 end
 task :homepage do
-  sh "gerbil html misc/homepage.erb > misc/homepage.html"
+  sh "cd misc && gerbil html homepage.erb > homepage.html"
 end
 task :publish_homepage do

data/TODO.txt CHANGED Viewed

@@ -1,3 +1,2 @@
 === TODO
-* Add filter to filter out Chinese punctuations.

data/bin/rmmseg CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.join(File.dirname(__FILE__), "..", "lib")
 require 'rmmseg'
 include RMMSeg

data/{lib/rmmseg → data}/chars.dic RENAMED Viewed

File without changes

data/data/punctuation.dic ADDED Viewed

@@ -0,0 +1,79 @@
+｛
+×
+π
+）
+〖
+；
+〗
+＜
+°
+“
+＋
+◆
+♀
+＝
+±
+←
+｝
+，
+”
+㎡
+◇
+＞
+↑
+～
+△
+？
+♂
+‰
+——
+→
+■
+￥
+－
+＠
+≈
+↓
+□
+〈
+′
+〉
+／
+★
+《
+○
+″
+☆
+》
+·
+∶
+！
+『
+§
+●
+』
+…
+【
+℃
+＃
+÷
+】
+№
+＄
+※
+≤
+‖
+％
+≥
+＆
+、
+‘
+〔
+。
+’
+√
+（
+〕
+￡
+：

data/{lib/rmmseg → data}/words.dic RENAMED Viewed

File without changes

data/lib/rmmseg.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
 require 'rmmseg/complex_algorithm'
 module RMMSeg
-  VERSION = '0.0.1'
+  VERSION = '0.1.0'
   # Segment +text+ using the algorithm configured.
   def segment(text)

data/lib/rmmseg/config.rb CHANGED Viewed

@@ -6,8 +6,9 @@ module RMMSeg
   class Config
     @algorithm = :complex
     @on_ambiguity = :select_first
-    @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
-                     [File.join(File.dirname(__FILE__), "words.dic"), false]]
+    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
+    @dictionaries = [[File.join(data_dir, "chars.dic"), true],
+                     [File.join(data_dir, "words.dic"), false]]
     @max_word_length = 4
     class << self

data/lib/rmmseg/ferret.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # This file integrates RMMSeg with Ferret
+require 'singleton'
 require 'rubygems'
 require 'ferret'
@@ -6,8 +7,25 @@ module RMMSeg
   module Ferret
     # The Analyzer class can be used with Ferret .
     class Analyzer < ::Ferret::Analysis::Analyzer
+      # Construct an Analyzer. Optional block can be used to
+      # add more +TokenFilter+s. e.g.
+      #
+      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      #   }
+      #
+      def initialize(&brk)
+        @brk = brk
+      end
       def token_stream(field, text)
-        Tokenizer.new(text)
+        t = PunctuationFilter.new(Tokenizer.new(text))
+        if @brk
+          @brk.call(t)
+        else
+          t
+        end
       end
     end
@@ -39,5 +57,58 @@ module RMMSeg
         @algor = RMMSeg::Config.algorithm_instance(@text)
       end
     end
+    # PunctuationFilter filters out the stand-alone Chinese
+    # punctuation tokens.
+    class PunctuationFilter < ::Ferret::Analysis::TokenStream
+      # The punctuation dictionary.
+      class Dictionary
+        include Singleton
+        DIC_FILE = File.join(File.dirname(__FILE__),
+                             "..",
+                             "..",
+                             "data",
+                             "punctuation.dic")
+        def initialize
+          @dic = Hash.new
+          File.open(DIC_FILE, "r") do |f|
+            f.each_line { |line|
+              @dic[line.chomp.freeze] = nil
+            }
+          end
+        end
+        def include?(str)
+          @dic.has_key?(str)
+        end
+      end
+      def initialize(stream)
+        @stream = stream
+      end
+      # Get next token, skip stand alone Chinese punctuations.
+      def next
+        token = nil
+        dic = Dictionary.instance
+        loop do
+          token = @stream.next
+          break if token.nil?
+          break unless dic.include? token.text
+        end
+        token
+      end
+      def text
+        @stream.text
+      end
+      def text=(str)
+        @stream.text = str
+      end
+    end
   end
 end

data/misc/ferret_example.rb ADDED Viewed

@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'rmmseg'
+require 'rmmseg/ferret'
+analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+  Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+}
+$index = Ferret::Index::Index.new(:analyzer => analyzer,
+                                  :path => '/tmp/index')
+$index << {
+  :title => "分词",
+  :content => "中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。"
+}
+$index << {
+  :title => "RMMSeg",
+  :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。"
+}
+$index << {
+  :title => "Ruby 1.9",
+  :content => "Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。"
+}
+$index << {
+  :title => "Ferret",
+  :content => <<END
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.
+END
+}
+def highlight_search(key)
+  $index.search_each(%Q!content:"#{key}"!) do |id, score|
+    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
+    puts "-"*40
+    highlights = $index.highlight("content:#{key}", id,
+                                  :field => :content,
+                                  :pre_tag => "\033[36m",
+                                  :post_tag => "\033[m")
+    puts "#{highlights}"
+    puts ""
+  end
+end
+ARGV.each { |key|
+  puts "\033[33mSearching for #{key}...\033[m"
+  puts ""
+  highlight_search(key)
+}
+# Local Variables:
+# coding: utf-8
+# End:

data/misc/homepage.erb CHANGED Viewed

@@ -1,3 +1,4 @@
+<%# -*- mode: text; coding: utf-8 -*- %>
 <%
   $title = "RMMSeg Homepage"
   $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
@@ -22,7 +23,7 @@
   * http://technology.chtsai.org/mmseg/
   * http://pluskid.lifegoo.com/?p=261
-  RMMSeg can be used as either a stand alone program or an Analyser of
+  RMMSeg can be used as either a stand alone program or an Analyzer of
   "Ferret":http://ferret.davebalmain.com/trac.
 <% end %>
@@ -46,7 +47,7 @@
         sudo gem install rmmseg
-      Or you can download the gem file manually from RubyForge and install it locally:
+      Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
         sudo gem install --local rmmseg-x.y.z.gem
@@ -77,15 +78,94 @@
       rmmseg -h
-    It reads from STDIN and print result to STDOUT.
+    It reads from STDIN and print result to STDOUT. Here is a real
+    example:
+      $ echo "我们都喜欢用 Ruby" | rmmseg
+      我们 都 喜欢 用 Ruby
+  <% end %>
+  <% section "Analyzer for Ferret" do %>
+    RMMSeg include an analyzer for Ferret. It is simply ready to
+    use. Just require it and pass it to Ferret. Here's a complete
+    example:
+    <code lang="ruby">
+    <%# include ferret_example.rb %>
+    </code>
+    execute it on the following key words:
+      $ ruby ferret_example.rb Ruby 中文
+    will generate the following results:
+    <code lang="text">
+    Searching for Ruby...
+    *** Document "RMMSeg" found with a score of 0.21875
+    ----------------------------------------
+    RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+    *** Document "Ruby 1.9" found with a score of 0.21875
+    ----------------------------------------
+    Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。
+    *** Document "Ferret" found with a score of 0.176776692271233
+    ----------------------------------------
+    Ferret is a high-performance, full-featured text search engine library
+    written for Ruby. It is inspired by Apache Lucene Java project. With
+    the introduction of Ferret, Ruby users now have one of the fastest and
+    most flexible search libraries available. And it's surprisingly easy
+    to use.
+    Searching for 中文...
+    *** Document "分词" found with a score of 0.281680464744568
+    ----------------------------------------
+    中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。
+    *** Document "RMMSeg" found with a score of 0.281680464744568
+    ----------------------------------------
+    RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+    </code>
+    And if you run the example in terminal, you'll see the result
+    highlighted as in <%= xref "Ferret Example Screenshot" %>.
+    <% figure "Ferret Example Screenshot" do %>
+      !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
+    <% end %>
+  <% end %>
+  <% section "Customization" do %>
+    RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
+    <code lang="ruby">
+    RMMSeg::Config.dictionaries = [["dict1.dic", true],  # with frequency info
+                                   ["dict2.dic", false], # without
+                                   ["dict3.dic", false]]
+    RMMSeg::Config.max_word_length = 6
+    </code>
+    Or to use the simple algorithm for more efficient (and less accurate) segmenting:
+    <code>
+    RMMSeg::Config.algorithm = :simple
+    </code>
+    For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
   <% end %>
 <% end %>
 <% chapter "Resources" do %>
-  * "Project Home":http://rmmseg.rubyforge.org/: The Project page at RubyForge.
-  * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg.
-  * "Ferret Homepage":http://ferret.davebalmain.com/trac: The homepage of Ferret project.
+  * "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
+  * "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
+  * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
+  * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
 <% end %>
 <% footer do %>

data/misc/homepage.html CHANGED Viewed

@@ -2,7 +2,7 @@
 <html>
   <head>
     <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
-    <meta name="date" content="31 January 2008"/>
+    <meta name="date" content="01 February 2008"/>
     <meta name="author" content="pluskid"/>
     <meta name="generator" content="Gerbil 1.1.0"/>
     <title>RMMSeg Homepage</title>
@@ -763,19 +763,19 @@
       <h1 class="title">RMMSeg Homepage</h1>
       <h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
-      <h3 class="date">31 January 2008</h3>
+      <h3 class="date">01 February 2008</h3>
     </div>
-    <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606563428" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606565568" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606567068" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606569178" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606570778" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606572868" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606577658" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606579198" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606581648" href="#Resources">Resources</a></li></ul></div>
+    <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606801458" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3&nbsp;&nbsp;<a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
-    <div id="lof"><h1>Notes</h1> <ol><li><a id="a-606574508" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
+    <div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
     <div id="content">
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Introduction" href="#a-606563428">1</a>
+    <a class="toc" id="Introduction" href="#a-606801458">1</a>
     <br/>
@@ -805,13 +805,13 @@ following essays:</p>
 	</ul>
-	<p>RMMSeg can be used as either a stand alone program or an Analyser of
+	<p>RMMSeg can be used as either a stand alone program or an Analyzer of
 <a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
 </div>
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Setup" href="#a-606565568">2</a>
+    <a class="toc" id="Setup" href="#a-606803598">2</a>
     <br/>
@@ -820,7 +820,7 @@ following essays:</p>
   <div class="content"><div class="section">
   <h2 class="title">
-    <a class="toc" id="Requirements" href="#a-606567068">2.1</a>&nbsp;&nbsp;Requirements
+    <a class="toc" id="Requirements" href="#a-606805098">2.1</a>&nbsp;&nbsp;Requirements
   </h2>
   <div class="content">Your system needs the following software to run RMMSeg.
@@ -850,11 +850,11 @@ following essays:</p>
 </div>
 <div class="section">
   <h2 class="title">
-    <a class="toc" id="Installation" href="#a-606569178">2.2</a>&nbsp;&nbsp;Installation
+    <a class="toc" id="Installation" href="#a-606807208">2.2</a>&nbsp;&nbsp;Installation
   </h2>
   <div class="content"><div class="section">
   <h3 class="title">
-    <a class="toc" id="Using-RubyGems" href="#a-606570778">2.2.1</a>&nbsp;&nbsp;Using RubyGems
+    <a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a>&nbsp;&nbsp;Using RubyGems
   </h3>
   <div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
@@ -862,18 +862,18 @@ following essays:</p>
 	<pre>sudo gem install rmmseg</pre>
-	<p>Or you can download the gem file manually from RubyForge and install it locally:</p>
+	<p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
 	<pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
 </div>
 <div class="section">
   <h3 class="title">
-    <a class="toc" id="From-Subversion" href="#a-606572868">2.2.2</a>&nbsp;&nbsp;From Subversion
+    <a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a>&nbsp;&nbsp;From Subversion
   </h3>
   <div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
 <div class="note">
-  <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606574508">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
+  <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
   <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAABHNCSVQICAgI
 fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Usage" href="#a-606577658">3</a>
+    <a class="toc" id="Usage" href="#a-606815688">3</a>
     <br/>
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
   <div class="content"><div class="section">
   <h2 class="title">
-    <a class="toc" id="Stand-Alone-rmmseg" href="#a-606579198">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
+    <a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
   </h2>
   <div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
 	<pre>rmmseg -h</pre>
-	<p>It reads from STDIN and print result to STDOUT.</p></div>
+	<p>It reads from STDIN and print result to STDOUT. Here is a real
+example:</p>
+	<pre>$ echo "我们都喜欢用 Ruby" | rmmseg
+我们 都 喜欢 用 Ruby</pre></div>
+</div>
+<div class="section">
+  <h2 class="title">
+    <a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a>&nbsp;&nbsp;Analyzer for Ferret
+  </h2>
+  <div class="content"><p>RMMSeg include an analyzer for Ferret. It is simply ready to
+use. Just require it and pass it to Ferret. Here&#8217;s a complete
+example:</p>
+	<pre class="code" lang="ruby">
+<span style="color:#888">#!/usr/bin/env ruby</span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
+analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
+<span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">分词</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ferret</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&lt;&lt;END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.</span><span style="color:#710">
+END</span></span>
+}
+<span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
+<span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">&quot;</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">*** Document </span><span style="color:#04D">\&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\&quot;</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">-</span><span style="color:#710">&quot;</span></span>*<span style="color:#00D; font-weight:bold">40</span>
+highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>, id,
+                              <span style="color:#A60">:field</span> =&gt; <span style="color:#A60">:content</span>,
+                              <span style="color:#A60">:pre_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">&quot;</span></span>,
+                              <span style="color:#A60">:post_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>)
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
+<span style="color:#080; font-weight:bold">end</span>
+<span style="color:#080; font-weight:bold">end</span>
+<span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
+highlight_search(key)
+}
+<span style="color:#888"># Local Variables:</span>
+<span style="color:#888"># coding: utf-8</span>
+<span style="color:#888"># End:</span>
+</pre>
+	<p>execute it on the following key words:</p>
+	<pre>$ ruby ferret_example.rb Ruby 中文</pre>
+	<p>will generate the following results:</p>
+	<pre class="code" lang="text">
+Searching for Ruby...
+*** Document &quot;RMMSeg&quot; found with a score of 0.21875
+----------------------------------------
+RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+*** Document &quot;Ruby 1.9&quot; found with a score of 0.21875
+----------------------------------------
+Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。
+*** Document &quot;Ferret&quot; found with a score of 0.176776692271233
+----------------------------------------
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it's surprisingly easy
+to use.
+Searching for 中文...
+*** Document &quot;分词&quot; found with a score of 0.281680464744568
+----------------------------------------
+中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。
+*** Document &quot;RMMSeg&quot; found with a score of 0.281680464744568
+----------------------------------------
+RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+</pre>
+	<p>And if you run the example in terminal, you&#8217;ll see the result
+highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
+	<p><div class="figure">
+  <p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>.&nbsp;&nbsp;Ferret Example Screenshot</p>
+  <div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
+</div></p></div>
+</div>
+<div class="section">
+  <h2 class="title">
+    <a class="toc" id="Customization" href="#a-606825488">3.3</a>&nbsp;&nbsp;Customization
+  </h2>
+  <div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
+	<pre class="code" lang="ruby">
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict1.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">true</span>],  <span style="color:#888"># with frequency info</span>
+                               [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict2.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
+                               [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict3.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
+</pre>
+	<p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
+	<pre class="code">
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
+</pre>
+	<p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
 </div></div>
 </div>
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Resources" href="#a-606581648">4</a>
+    <a class="toc" id="Resources" href="#a-606828108">4</a>
     <br/>
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
   </h1>
   <div class="content"><ul>
-	<li><a href="http://rmmseg.rubyforge.org/">Project Home</a>: The Project page at RubyForge.</li>
-		<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg.</li>
-		<li><a href="http://ferret.davebalmain.com/trac">Ferret Homepage</a>: The homepage of Ferret project.</li>
+	<li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
+		<li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
+		<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
+		<li><a href="mailto:pluskid@gmail.com">Author&#8217;s Email</a>: Contact me if you have any problem.</li>
 	</ul></div>
 </div>
 </div>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - pluskid
@@ -39,10 +39,12 @@ files:
 - Rakefile
 - TODO.txt
 - bin/rmmseg
+- data/chars.dic
+- data/punctuation.dic
+- data/words.dic
 - lib/rmmseg.rb
 - lib/rmmseg/algorithm.rb
 - lib/rmmseg/amibguity.rb
-- lib/rmmseg/chars.dic
 - lib/rmmseg/chunk.rb
 - lib/rmmseg/complex_algorithm.rb
 - lib/rmmseg/config.rb
@@ -56,7 +58,7 @@ files:
 - lib/rmmseg/svwl_rule.rb
 - lib/rmmseg/token.rb
 - lib/rmmseg/word.rb
-- lib/rmmseg/words.dic
+- misc/ferret_example.rb
 - misc/homepage.erb
 - misc/homepage.html
 - spec/chunk_spec.rb