RubyGems - rmmseg - Versions diffs - 0.0.1 → 0.1.0 - Mend

rmmseg 0.0.1 → 0.1.0

Files changed (16) hide show

data/History.txt +6 -1
data/Manifest.txt +4 -2
data/README.txt +13 -1
data/Rakefile +1 -1
data/TODO.txt +0 -1
data/bin/rmmseg +0 -2
data/{lib/rmmseg → data}/chars.dic +0 -0
data/data/punctuation.dic +79 -0
data/{lib/rmmseg → data}/words.dic +0 -0
data/lib/rmmseg.rb +1 -1
data/lib/rmmseg/config.rb +3 -2
data/lib/rmmseg/ferret.rb +72 -1
data/misc/ferret_example.rb +56 -0
data/misc/homepage.erb +86 -6
data/misc/homepage.html +166 -20
metadata +5 -3

data/History.txt CHANGED Viewed

@@ -1,6 +1,11 @@
+=== 0.1.0 / 2008-02-01
+* Add filter to filter out Chinese punctuations.
 === 0.0.1 / 2008-01-31
-* Analyser integration with Ferret.
+* Analyzer integration with Ferret.
 * rdoc added
 * Lazily init the +Word+ objects inside the +Dictionary+.
 * Handle English punctuation correctly.

data/Manifest.txt CHANGED Viewed

@@ -4,10 +4,12 @@ README.txt
 Rakefile
 TODO.txt
 bin/rmmseg
+data/chars.dic
+data/punctuation.dic
+data/words.dic
 lib/rmmseg.rb
 lib/rmmseg/algorithm.rb
 lib/rmmseg/amibguity.rb
-lib/rmmseg/chars.dic
 lib/rmmseg/chunk.rb
 lib/rmmseg/complex_algorithm.rb
 lib/rmmseg/config.rb
@@ -21,7 +23,7 @@ lib/rmmseg/simple_algorithm.rb
 lib/rmmseg/svwl_rule.rb
 lib/rmmseg/token.rb
 lib/rmmseg/word.rb
-lib/rmmseg/words.dic
+misc/ferret_example.rb
 misc/homepage.erb
 misc/homepage.html
 spec/chunk_spec.rb

data/README.txt CHANGED Viewed

@@ -23,11 +23,23 @@ following essays:
 * Provides +rmmseg+ command line tool for quick and easy way to access
   the word segment feature.
-* Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
+* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
 == SYNOPSIS:
+Using the command line tool +rmmseg+ is simple:
   $ rmmseg --separator _ < input.txt
+Passing the +-h+ option prints an overview of all supported options.
+Using the +Analyzer+ for Ferret is even easier:
+  require 'rmmseg'
+  require 'rmmseg/ferret'
+  analyzer = RMMSeg::Ferret::Analyzer.new
+  index = Ferret::Index::Index.new(:analyzer => analyzer)
+For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
 == REQUIREMENTS:

data/Rakefile CHANGED Viewed

@@ -23,7 +23,7 @@ Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
 end
 task :homepage do
-  sh "gerbil html misc/homepage.erb > misc/homepage.html"
+  sh "cd misc && gerbil html homepage.erb > homepage.html"
 end
 task :publish_homepage do

data/TODO.txt CHANGED Viewed

@@ -1,3 +1,2 @@
 === TODO
-* Add filter to filter out Chinese punctuations.

data/bin/rmmseg CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env ruby
-$: << File.join(File.dirname(__FILE__), "..", "lib")
 require 'rmmseg'
 include RMMSeg

data/{lib/rmmseg → data}/chars.dic RENAMED Viewed

File without changes

data/data/punctuation.dic ADDED Viewed

@@ -0,0 +1,79 @@
+｛
+×
+π
+）
+〖
+；
+〗
+＜
+°
+“
+＋
+◆
+♀
+＝
+±
+←
+｝
+，
+”
+㎡
+◇
+＞
+↑
+～
+△
+？
+♂
+‰
+——
+→
+■
+￥
+－
+＠
+≈
+↓
+□
+〈
+′
+〉
+／
+★
+《
+○
+″
+☆
+》
+·
+∶
+！
+『
+§
+●
+』
+…
+【
+℃
+＃
+÷
+】
+№
+＄
+※
+≤
+‖
+％
+≥
+＆
+、
+‘
+〔
+。
+’
+√
+（
+〕
+￡
+：

data/{lib/rmmseg → data}/words.dic RENAMED Viewed

File without changes

data/lib/rmmseg.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
 require 'rmmseg/complex_algorithm'
 module RMMSeg
-  VERSION = '0.0.1'
+  VERSION = '0.1.0'
   # Segment +text+ using the algorithm configured.
   def segment(text)

data/lib/rmmseg/config.rb CHANGED Viewed

@@ -6,8 +6,9 @@ module RMMSeg
   class Config
     @algorithm = :complex
     @on_ambiguity = :select_first
-    @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
-                     [File.join(File.dirname(__FILE__), "words.dic"), false]]
+    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
+    @dictionaries = [[File.join(data_dir, "chars.dic"), true],
+                     [File.join(data_dir, "words.dic"), false]]
     @max_word_length = 4
     class << self

data/lib/rmmseg/ferret.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # This file integrates RMMSeg with Ferret
+require 'singleton'
 require 'rubygems'
 require 'ferret'
@@ -6,8 +7,25 @@ module RMMSeg
   module Ferret
     # The Analyzer class can be used with Ferret.
     class Analyzer < ::Ferret::Analysis::Analyzer
+      # Construct an Analyzer. Optional block can be used to
+      # add more +TokenFilter+s. e.g.
+      #
+      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      #   }
+      #
+      def initialize(&brk)
+        @brk = brk
+      end
       def token_stream(field, text)
-        Tokenizer.new(text)
+        t = PunctuationFilter.new(Tokenizer.new(text))
+        if @brk
+          @brk.call(t)
+        else
+          t
+        end
       end
     end
@@ -39,5 +57,58 @@ module RMMSeg
         @algor = RMMSeg::Config.algorithm_instance(@text)
       end
     end
+    # PunctuationFilter filters out the stand-alone Chinese
+    # punctuation tokens.
+    class PunctuationFilter < ::Ferret::Analysis::TokenStream
+      # The punctuation dictionary.
+      class Dictionary
+        include Singleton
+        DIC_FILE = File.join(File.dirname(__FILE__),
+                             "..",
+                             "..",
+                             "data",
+                             "punctuation.dic")
+        def initialize
+          @dic = Hash.new
+          File.open(DIC_FILE, "r") do |f|
+            f.each_line { |line|
+              @dic[line.chomp.freeze] = nil
+            }
+          end
+        end
+        def include?(str)
+          @dic.has_key?(str)
+        end
+      end
+      def initialize(stream)
+        @stream = stream
+      end
+      # Get next token, skip stand alone Chinese punctuations.
+      def next
+        token = nil
+        dic = Dictionary.instance
+        loop do
+          token = @stream.next
+          break if token.nil?
+          break unless dic.include? token.text
+        end
+        token
+      end
+      def text
+        @stream.text
+      end
+      def text=(str)
+        @stream.text = str
+      end
+    end
   end
 end

data/misc/ferret_example.rb ADDED Viewed

@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'rmmseg'
+require 'rmmseg/ferret'
+analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+  Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+}
+$index = Ferret::Index::Index.new(:analyzer => analyzer,
+                                  :path => '/tmp/index')
+$index << {
+  :title => "分词",
+  :content => "中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。"
+}
+$index << {
+  :title => "RMMSeg",
+  :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。"
+}
+$index << {
+  :title => "Ruby 1.9",
+  :content => "Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。"
+}
+$index << {
+  :title => "Ferret",
+  :content => <<END
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.
+END
+}
+def highlight_search(key)
+  $index.search_each(%Q!content:"#{key}"!) do |id, score|
+    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
+    puts "-"*40
+    highlights = $index.highlight("content:#{key}", id,
+                                  :field => :content,
+                                  :pre_tag => "\033[36m",
+                                  :post_tag => "\033[m")
+    puts "#{highlights}"
+    puts ""
+  end
+end
+ARGV.each { |key|
+  puts "\033[33mSearching for #{key}...\033[m"
+  puts ""
+  highlight_search(key)
+}
+# Local Variables:
+# coding: utf-8
+# End:

data/misc/homepage.erb CHANGED Viewed

@@ -1,3 +1,4 @@
+<%# -*- mode: text; coding: utf-8 -*- %>
 <%
   $title = "RMMSeg Homepage"
   $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
@@ -22,7 +23,7 @@
   * http://technology.chtsai.org/mmseg/
   * http://pluskid.lifegoo.com/?p=261
-  RMMSeg can be used as either a stand alone program or an Analyser of
+  RMMSeg can be used as either a stand alone program or an Analyzer of
   "Ferret":http://ferret.davebalmain.com/trac.
 <% end %>
@@ -46,7 +47,7 @@
         sudo gem install rmmseg
-      Or you can download the gem file manually from RubyForge and install it locally:
+      Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
         sudo gem install --local rmmseg-x.y.z.gem
@@ -77,15 +78,94 @@
       rmmseg -h
-    It reads from STDIN and print result to STDOUT.
+    It reads from STDIN and print result to STDOUT. Here is a real
+    example:
+      $ echo "我们都喜欢用 Ruby" | rmmseg
+      我们 都 喜欢 用 Ruby
+  <% end %>
+  <% section "Analyzer for Ferret" do %>
+    RMMSeg includes an analyzer for Ferret. It is simply ready to
+    use. Just require it and pass it to Ferret. Here's a complete
+    example:
+    <code lang="ruby">
+    <%# include ferret_example.rb %>
+    </code>
+    execute it on the following key words:
+      $ ruby ferret_example.rb Ruby 中文
+    will generate the following results:
+    <code lang="text">
+    Searching for Ruby...
+    *** Document "RMMSeg" found with a score of 0.21875
+    ----------------------------------------
+    RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+    *** Document "Ruby 1.9" found with a score of 0.21875
+    ----------------------------------------
+    Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。
+    *** Document "Ferret" found with a score of 0.176776692271233
+    ----------------------------------------
+    Ferret is a high-performance, full-featured text search engine library
+    written for Ruby. It is inspired by Apache Lucene Java project. With
+    the introduction of Ferret, Ruby users now have one of the fastest and
+    most flexible search libraries available. And it's surprisingly easy
+    to use.
+    Searching for 中文...
+    *** Document "分词" found with a score of 0.281680464744568
+    ----------------------------------------
+    中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。
+    *** Document "RMMSeg" found with a score of 0.281680464744568
+    ----------------------------------------
+    RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+    </code>
+    And if you run the example in terminal, you'll see the result
+    highlighted as in <%= xref "Ferret Example Screenshot" %>.
+    <% figure "Ferret Example Screenshot" do %>
+      !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
+    <% end %>
+  <% end %>
+  <% section "Customization" do %>
+    RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
+    <code lang="ruby">
+    RMMSeg::Config.dictionaries = [["dict1.dic", true],  # with frequency info
+                                   ["dict2.dic", false], # without
+                                   ["dict3.dic", false]]
+    RMMSeg::Config.max_word_length = 6
+    </code>
+    Or to use the simple algorithm for more efficient (and less accurate) segmenting:
+    <code>
+    RMMSeg::Config.algorithm = :simple
+    </code>
+    For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
   <% end %>
 <% end %>
 <% chapter "Resources" do %>
-  * "Project Home":http://rmmseg.rubyforge.org/: The Project page at RubyForge.
-  * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg.
-  * "Ferret Homepage":http://ferret.davebalmain.com/trac: The homepage of Ferret project.
+  * "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
+  * "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
+  * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
+  * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
 <% end %>
 <% footer do %>

data/misc/homepage.html CHANGED Viewed

@@ -2,7 +2,7 @@
 <html>
   <head>
     <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
-    <meta name="date" content="31 January 2008"/>
+    <meta name="date" content="01 February 2008"/>
     <meta name="author" content="pluskid"/>
     <meta name="generator" content="Gerbil 1.1.0"/>
     <title>RMMSeg Homepage</title>
@@ -763,19 +763,19 @@
       <h1 class="title">RMMSeg Homepage</h1>
       <h2 class="authors"><a href="http://pluskid.lifegoo.com">pluskid</a></h2>
-      <h3 class="date">31 January 2008</h3>
+      <h3 class="date">01 February 2008</h3>
     </div>
-    <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606563428" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606565568" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606567068" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606569178" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606570778" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606572868" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606577658" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606579198" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606581648" href="#Resources">Resources</a></li></ul></div>
+    <div id="toc"><h1>Contents</h1> <ul><li>1&nbsp;&nbsp;<a id="a-606801458" href="#Introduction">Introduction</a></li><li>2&nbsp;&nbsp;<a id="a-606803598" href="#Setup">Setup</a><ul><li>2.1&nbsp;&nbsp;<a id="a-606805098" href="#Requirements">Requirements</a></li><li>2.2&nbsp;&nbsp;<a id="a-606807208" href="#Installation">Installation</a><ul><li>2.2.1&nbsp;&nbsp;<a id="a-606808808" href="#Using-RubyGems">Using RubyGems</a></li><li>2.2.2&nbsp;&nbsp;<a id="a-606810898" href="#From-Subversion">From Subversion</a></li></ul></li></ul></li><li>3&nbsp;&nbsp;<a id="a-606815688" href="#Usage">Usage</a><ul><li>3.1&nbsp;&nbsp;<a id="a-606817228" href="#Stand-Alone-rmmseg">Stand Alone rmmseg</a></li><li>3.2&nbsp;&nbsp;<a id="a-606819308" href="#Analyzer-for-Ferret">Analyzer for Ferret</a></li><li>3.3&nbsp;&nbsp;<a id="a-606825488" href="#Customization">Customization</a></li></ul></li><li>4&nbsp;&nbsp;<a id="a-606828108" href="#Resources">Resources</a></li></ul></div>
-    <div id="lof"><h1>Notes</h1> <ol><li><a id="a-606574508" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
+    <div id="lof"><h1>Figures</h1> <ol><li><a id="a-606823268" href="#Ferret-Example-Screenshot">Ferret Example Screenshot</a></li></ol><h1>Notes</h1> <ol><li><a id="a-606812538" href="#The-latest-code-might-be-unstable">The latest code might be unstable</a></li></ol></div>
     <div id="content">
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Introduction" href="#a-606563428">1</a>
+    <a class="toc" id="Introduction" href="#a-606801458">1</a>
     <br/>
@@ -805,13 +805,13 @@ following essays:</p>
 	</ul>
-	<p>RMMSeg can be used as either a stand alone program or an Analyser of
+	<p>RMMSeg can be used as either a stand alone program or an Analyzer of
 <a href="http://ferret.davebalmain.com/trac">Ferret</a>.</p></div>
 </div>
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Setup" href="#a-606565568">2</a>
+    <a class="toc" id="Setup" href="#a-606803598">2</a>
     <br/>
@@ -820,7 +820,7 @@ following essays:</p>
   <div class="content"><div class="section">
   <h2 class="title">
-    <a class="toc" id="Requirements" href="#a-606567068">2.1</a>&nbsp;&nbsp;Requirements
+    <a class="toc" id="Requirements" href="#a-606805098">2.1</a>&nbsp;&nbsp;Requirements
   </h2>
   <div class="content">Your system needs the following software to run RMMSeg.
@@ -850,11 +850,11 @@ following essays:</p>
 </div>
 <div class="section">
   <h2 class="title">
-    <a class="toc" id="Installation" href="#a-606569178">2.2</a>&nbsp;&nbsp;Installation
+    <a class="toc" id="Installation" href="#a-606807208">2.2</a>&nbsp;&nbsp;Installation
   </h2>
   <div class="content"><div class="section">
   <h3 class="title">
-    <a class="toc" id="Using-RubyGems" href="#a-606570778">2.2.1</a>&nbsp;&nbsp;Using RubyGems
+    <a class="toc" id="Using-RubyGems" href="#a-606808808">2.2.1</a>&nbsp;&nbsp;Using RubyGems
   </h3>
   <div class="content"><p>To install the gem remotely from <a href="http://rubyforge.org">RubyForge</a> :</p>
@@ -862,18 +862,18 @@ following essays:</p>
 	<pre>sudo gem install rmmseg</pre>
-	<p>Or you can download the gem file manually from RubyForge and install it locally:</p>
+	<p>Or you can download the gem file manually from <a href="http://rubyforge.org/projects/rmmseg/">RubyForge</a> and install it locally:</p>
 	<pre>sudo gem install --local rmmseg-x.y.z.gem</pre></div>
 </div>
 <div class="section">
   <h3 class="title">
-    <a class="toc" id="From-Subversion" href="#a-606572868">2.2.2</a>&nbsp;&nbsp;From Subversion
+    <a class="toc" id="From-Subversion" href="#a-606810898">2.2.2</a>&nbsp;&nbsp;From Subversion
   </h3>
   <div class="content"><p>From subversion repository hosted at <a href="http://rmmseg.rubyforge.org/svn/">RubyForge</a>, you can always get the latest source code.
 <div class="note">
-  <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606574508">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
+  <p class="title"><a class="toc" id="The-latest-code-might-be-unstable" href="#a-606812538">Note 1</a>.&nbsp;&nbsp;The latest code might be unstable</p>
   <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAABHNCSVQICAgI
 fAhkiAAAAAlwSFlzAAAN1wAADdcBQiibeAAAABl0RVh0U29mdHdhcmUAd3d3
@@ -954,7 +954,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Usage" href="#a-606577658">3</a>
+    <a class="toc" id="Usage" href="#a-606815688">3</a>
     <br/>
@@ -963,7 +963,7 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
   <div class="content"><div class="section">
   <h2 class="title">
-    <a class="toc" id="Stand-Alone-rmmseg" href="#a-606579198">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
+    <a class="toc" id="Stand-Alone-rmmseg" href="#a-606817228">3.1</a>&nbsp;&nbsp;Stand Alone rmmseg
   </h2>
   <div class="content"><p>RMMSeg comes with a script <code class="code">rmmseg</code>. To get the basic usage, just execute it with <code class="code">-h</code> option:</p>
@@ -971,13 +971,158 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
 	<pre>rmmseg -h</pre>
-	<p>It reads from STDIN and print result to STDOUT.</p></div>
+	<p>It reads from STDIN and print result to STDOUT. Here is a real
+example:</p>
+	<pre>$ echo "我们都喜欢用 Ruby" | rmmseg
+我们 都 喜欢 用 Ruby</pre></div>
+</div>
+<div class="section">
+  <h2 class="title">
+    <a class="toc" id="Analyzer-for-Ferret" href="#a-606819308">3.2</a>&nbsp;&nbsp;Analyzer for Ferret
+  </h2>
+  <div class="content"><p>RMMSeg includes an analyzer for Ferret. It is simply ready to
+use. Just require it and pass it to Ferret. Here&#8217;s a complete
+example:</p>
+	<pre class="code" lang="ruby">
+<span style="color:#888">#!/usr/bin/env ruby</span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rubygems</span><span style="color:#710">'</span></span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg</span><span style="color:#710">'</span></span>
+require <span style="background-color:#fff0f0"><span style="color:#710">'</span><span style="color:#D20">rmmseg/ferret</span><span style="color:#710">'</span></span>
+analyzer = <span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Analyzer</span>.new
+<span style="color:#d70; font-weight:bold">$index</span> = <span style="color:#036; font-weight:bold">Ferret</span>::<span style="color:#036; font-weight:bold">Index</span>::<span style="color:#036; font-weight:bold">Index</span>.new(<span style="color:#A60">:analyzer</span> =&gt; analyzer)
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">分词</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。</span><span style="color:#710">&quot;</span></span>
+}
+<span style="color:#d70; font-weight:bold">$index</span> &lt;&lt; {
+<span style="color:#A60">:title</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">Ferret</span><span style="color:#710">&quot;</span></span>,
+<span style="color:#A60">:content</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&lt;&lt;END</span></span><span style="background-color:#fff0f0"><span style="color:#D20">
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.</span><span style="color:#710">
+END</span></span>
+}
+<span style="color:#080; font-weight:bold">def</span> <span style="color:#06B; font-weight:bold">highlight_search</span>(key)
+<span style="color:#d70; font-weight:bold">$index</span>.search_each(<span style="background-color:#fff0f0"><span style="color:#710">%Q!</span><span style="color:#D20">content:&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">&quot;</span><span style="color:#710">!</span></span>) <span style="color:#080; font-weight:bold">do</span> |id, score|
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">*** Document </span><span style="color:#04D">\&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span><span style="color:#d70; font-weight:bold">$index</span>[id][<span style="color:#A60">:title</span>]<span style="font-weight: bold; color: #888">}</span></span><span style="color:#04D">\&quot;</span><span style="color:#D20"> found with a score of </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>score<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">-</span><span style="color:#710">&quot;</span></span>*<span style="color:#00D; font-weight:bold">40</span>
+highlights = <span style="color:#d70; font-weight:bold">$index</span>.highlight(<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">content:</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>, id,
+                              <span style="color:#A60">:field</span> =&gt; <span style="color:#A60">:content</span>,
+                              <span style="color:#A60">:pre_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[36m</span><span style="color:#710">&quot;</span></span>,
+                              <span style="color:#A60">:post_tag</span> =&gt; <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>)
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>highlights<span style="font-weight: bold; color: #888">}</span></span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
+<span style="color:#080; font-weight:bold">end</span>
+<span style="color:#080; font-weight:bold">end</span>
+<span style="color:#038; font-weight:bold">ARGV</span>.each { |key|
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#04D">\033</span><span style="color:#D20">[33mSearching for </span><span style="background: #eee"><span style="font-weight: bold; color: #888">#{</span>key<span style="font-weight: bold; color: #888">}</span></span><span style="color:#D20">...</span><span style="color:#04D">\033</span><span style="color:#D20">[m</span><span style="color:#710">&quot;</span></span>
+puts <span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#710">&quot;</span></span>
+highlight_search(key)
+}
+<span style="color:#888"># Local Variables:</span>
+<span style="color:#888"># coding: utf-8</span>
+<span style="color:#888"># End:</span>
+</pre>
+	<p>execute it on the following key words:</p>
+	<pre>$ ruby ferret_example.rb Ruby 中文</pre>
+	<p>will generate the following results:</p>
+	<pre class="code" lang="text">
+Searching for Ruby...
+*** Document &quot;RMMSeg&quot; found with a score of 0.21875
+----------------------------------------
+RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+*** Document &quot;Ruby 1.9&quot; found with a score of 0.21875
+----------------------------------------
+Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。
+*** Document &quot;Ferret&quot; found with a score of 0.176776692271233
+----------------------------------------
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it's surprisingly easy
+to use.
+Searching for 中文...
+*** Document &quot;分词&quot; found with a score of 0.281680464744568
+----------------------------------------
+中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。
+*** Document &quot;RMMSeg&quot; found with a score of 0.281680464744568
+----------------------------------------
+RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。
+</pre>
+	<p>And if you run the example in terminal, you&#8217;ll see the result
+highlighted as in <a class="xref" href="#Ferret-Example-Screenshot">Figure 1: <em>Ferret Example Screenshot</em></a>.</p>
+	<p><div class="figure">
+  <p class="title"><a class="toc" id="Ferret-Example-Screenshot" href="#a-606823268">Figure 1</a>.&nbsp;&nbsp;Ferret Example Screenshot</p>
+  <div class="content"><img src="http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png" alt="" /></div>
+</div></p></div>
+</div>
+<div class="section">
+  <h2 class="title">
+    <a class="toc" id="Customization" href="#a-606825488">3.3</a>&nbsp;&nbsp;Customization
+  </h2>
+  <div class="content"><p>RMMSeg can be customized through <code class="code"><span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span></code>. For example, to use your own dictionaries, just set it before starting to do segmentation:</p>
+	<pre class="code" lang="ruby">
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.dictionaries = [[<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict1.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">true</span>],  <span style="color:#888"># with frequency info</span>
+                               [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict2.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>], <span style="color:#888"># without</span>
+                               [<span style="background-color:#fff0f0"><span style="color:#710">&quot;</span><span style="color:#D20">dict3.dic</span><span style="color:#710">&quot;</span></span>, <span style="color:#038; font-weight:bold">false</span>]]
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.max_word_length = <span style="color:#00D; font-weight:bold">6</span>
+</pre>
+	<p>Or to use the simple algorithm for more efficient (and less accurate) segmenting:</p>
+	<pre class="code">
+<span style="color:#036; font-weight:bold">RMMSeg</span>::<span style="color:#036; font-weight:bold">Config</span>.algorithm = <span style="color:#A60">:simple</span>
+</pre>
+	<p>For more information on customization, please refer to the RDoc of <a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RMMSeg::Config</a>.</p></div>
 </div></div>
 </div>
 <div class="chapter">
   <h1 class="title">
     Chapter
-    <a class="toc" id="Resources" href="#a-606581648">4</a>
+    <a class="toc" id="Resources" href="#a-606828108">4</a>
     <br/>
@@ -985,9 +1130,10 @@ DUCQhrhWJkj394A0gKeUCjVo3r9Zv0r2P3yyQqPd16MPAAAAAElFTkSuQmCC
   </h1>
   <div class="content"><ul>
-	<li><a href="http://rmmseg.rubyforge.org/">Project Home</a>: The Project page at RubyForge.</li>
-		<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg.</li>
-		<li><a href="http://ferret.davebalmain.com/trac">Ferret Homepage</a>: The homepage of Ferret project.</li>
+	<li><a href="http://rubyforge.org/projects/rmmseg/">Project Home</a>: The Project page at RubyForge.</li>
+		<li><a href="http://rmmseg.rubyforge.org/rmmseg/index.html">RDoc of RMMSeg</a>: The auto generated rdoc of RMMSeg.</li>
+		<li><a href="http://pluskid.lifegoo.com/?p=261">Implementation Details</a>: My blog post about the implementation details of RMMSeg (Chinese).</li>
+		<li><a href="mailto:pluskid@gmail.com">Author&#8217;s Email</a>: Contact me if you have any problem.</li>
 	</ul></div>
 </div>
 </div>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - pluskid
@@ -39,10 +39,12 @@ files:
 - Rakefile
 - TODO.txt
 - bin/rmmseg
+- data/chars.dic
+- data/punctuation.dic
+- data/words.dic
 - lib/rmmseg.rb
 - lib/rmmseg/algorithm.rb
 - lib/rmmseg/amibguity.rb
-- lib/rmmseg/chars.dic
 - lib/rmmseg/chunk.rb
 - lib/rmmseg/complex_algorithm.rb
 - lib/rmmseg/config.rb
@@ -56,7 +58,7 @@ files:
 - lib/rmmseg/svwl_rule.rb
 - lib/rmmseg/token.rb
 - lib/rmmseg/word.rb
-- lib/rmmseg/words.dic
+- misc/ferret_example.rb
 - misc/homepage.erb
 - misc/homepage.html
 - spec/chunk_spec.rb