RubyGems - lijia-rmmseg-cpp - Versions diffs - 10.2.9.2 - Mend

lijia-rmmseg-cpp 10.2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data/ext/rmmseg/token.h ADDED

@@ -0,0 +1,19 @@
+#ifndef _TOKEN_H_
+#define _TOKEN_H_
+namespace rmmseg
+{
+    // A piece of text described by a pointer and a length. The
+    // constructor only stores the pointer it is given; no copy is made.
+    struct Token
+    {
+        Token(const char *txt, int len)
+            :text(txt), length(len) { }
+        // `text' may or may not be nul-terminated, its length
+        // should be stored in the `length' field.
+        //
+        // if length is 0, this is an empty token
+        const char *text;
+        int length;
+    };
+}
+#endif /* _TOKEN_H_ */

data/ext/rmmseg/word.h ADDED

@@ -0,0 +1,44 @@
+#ifndef _WORD_H_
+#define _WORD_H_
+#include <climits>
+#include <cstring>
+#include "memory.h"
+namespace rmmseg
+{
+    const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
+    // A dictionary entry. make_word() sizes each allocation as
+    // sizeof(Word) + nbytes + 1 - word_embed_len, so `text' may extend
+    // past its declared word_embed_len bytes and must stay the last member.
+    struct Word
+    {
+        unsigned char   nbytes;   /* number of bytes */
+        char            length;   /* number of characters */
+        unsigned short  freq;
+        char            text[word_embed_len];
+    };
+    /**
+     * Allocate a Word with pool_alloc() and fill in its fields.
+     *
+     * text: the text of the word.
+     * length: number of characters (not bytes).
+     * freq: the frequency of the word; values above USHRT_MAX are clamped.
+     * nbytes: number of bytes of `text' to copy; -1 (the default) means
+     *         use strlen(text).
+     *
+     * Returns the new Word; its text is always nul-terminated.
+     *
+     * NOTE(review): nbytes and length are narrowed into the unsigned char /
+     * char fields of Word without a range check -- confirm callers never
+     * pass values that do not fit. The return value of pool_alloc() is
+     * also used unchecked; verify it cannot return a null pointer.
+     */
+    inline Word *make_word(const char *text, int length=1,
+                           int freq=0, int nbytes=-1)
+    {
+        if (freq > USHRT_MAX)
+            freq = USHRT_MAX;   /* avoid overflow */
+        if (nbytes == -1)
+            nbytes = std::strlen(text);
+        /* sizeof(Word) already counts word_embed_len bytes of text, so
+           only the difference (plus the terminating nul) is added. */
+        Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
+                                                 + nbytes+1
+                                                 - word_embed_len));
+        w->nbytes = nbytes;
+        w->length = length;
+        w->freq = freq;
+        /* strncpy does not terminate when it copies exactly nbytes,
+           hence the explicit nul on the next line. */
+        std::strncpy(w->text, text, nbytes);
+        w->text[nbytes] = '\0';
+        return w;
+    }
+}
+#endif /* _WORD_H_ */

data/lib/rmmseg.rb ADDED

@@ -0,0 +1,3 @@
+require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
+require File.join(File.dirname(__FILE__), '..',
+                  'ext', 'rmmseg', 'rmmseg')

data/lib/rmmseg/dictionary.rb ADDED

@@ -0,0 +1,59 @@
+module RMMSeg
+  # Registry of the dictionary files to load. It starts with the two
+  # default dictionaries located in the data directory two levels above
+  # this file.
+  module Dictionary
+    @dictionaries = [
+                     [:chars, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "chars.dic")],
+                     [:words, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "words.dic")]
+                    ]
+    class << self
+      #
+      # An array of dictionaries used by RMMSeg. Each entry is of the
+      # following form:
+      #
+      #   [type, path]
+      #
+      # where +type+ can be either <tt>:chars</tt> or <tt>:words</tt>. +path+ is the path
+      # to the dictionary file.
+      #
+      # The format of a <tt>:chars</tt> dictionary is a collection of lines of the
+      # following form:
+      #
+      #   freq char
+      #
+      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
+      # character. They are separated by <b>exactly one space</b>.
+      #
+      # The format of a <tt>:words</tt> dictionary is similar:
+      #
+      #   length word
+      #
+      # except the first number is not the frequency, but the number of
+      # characters (not number of bytes) in the word.
+      #
+      # There's a script (convert.rb) in the tools directory that can be used
+      # to convert and normalize dictionaries.
+      attr_accessor :dictionaries
+      # Add a user defined dictionary. +type+ can be
+      # +:chars+ or <tt>:words</tt>. See doc of dictionaries.
+      #
+      # Note the argument order is (path, type), while the stored entry
+      # is [type, path].
+      def add_dictionary(path, type)
+        @dictionaries << [type, path]
+      end
+      # Load dictionaries. Call this method after setting up the paths of
+      # the dictionaries to load and before any Algorithm object is
+      # created.
+      #
+      # Entries whose type is neither :chars nor :words are skipped
+      # silently.
+      #
+      # NOTE(review): load_chars and load_words are not defined in this
+      # file; presumably they are provided by the native extension --
+      # verify.
+      def load_dictionaries()
+        @dictionaries.each do |type, path|
+          if type == :chars
+            load_chars(path)
+          elsif type == :words
+            load_words(path)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/rmmseg/ferret.rb ADDED

@@ -0,0 +1,64 @@
+require 'rubygems'
+require 'rmmseg'
+require 'ferret'
+module RMMSeg
+  module Ferret
+    # The Analyzer class can be used with Ferret.
+    class Analyzer < ::Ferret::Analysis::Analyzer
+      # Construct an Analyzer. Optional block can be used to
+      # add more +TokenFilter+s. e.g.
+      #
+      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      #   }
+      #
+      # The block is stored and invoked later by token_stream.
+      def initialize(&brk)
+        @brk = brk
+      end
+      # Build the token stream for +text+: a new Tokenizer, passed through
+      # the block given to the constructor when there is one (the block's
+      # return value is then the result). +field+ is not used.
+      def token_stream(field, text)
+        t = Tokenizer.new(text)
+        if @brk
+          @brk.call(t)
+        else
+          t
+        end
+      end
+    end
+    # The Tokenizer tokenizes text with RMMSeg::Algorithm.
+    class Tokenizer < ::Ferret::Analysis::TokenStream
+      # Create a new Tokenizer to tokenize +str+. All set-up is done by
+      # the text= writer.
+      def initialize(str)
+        self.text = str
+      end
+      # Get the next token, or nil when the algorithm has no more tokens.
+      #
+      # The same Ferret token object is updated and returned on every
+      # call, so a caller that keeps a reference will see it change on the
+      # following call.
+      def next
+        tok = @algor.next_token
+        if tok.nil?
+          return nil
+        else
+          @token.text = tok.text
+          @token.start = tok.start
+          @token.end = tok.end
+          return @token
+        end
+      end
+      # Get the text being tokenized
+      def text
+        @text
+      end
+      # Set the text to be tokenized. This also creates a fresh reusable
+      # token and a new Algorithm for +str+, so tokenizing starts over.
+      def text=(str)
+        @token = ::Ferret::Analysis::Token.new("", 0, 0)
+        @text = str
+        @algor = Algorithm.new(@text)
+      end
+    end
+  end
+end

data/misc/convert.rb ADDED

@@ -0,0 +1,114 @@
+#!/usr/bin/ruby
+# A utility used to convert the old RMMSeg dictionary
+# to rmmseg-cpp format.
+# There are several constraints on the new rmmseg-cpp
+# dictionary format.
+#  - length of word should be specified in the dict
+#  - number and string should be separated by ONE space
+#  - there should be a newline at the end of file
+# $KCODE='u'
+# require 'jcode'
+# Print the command line help, preceded by an error line when +msg+ is
+# given, then terminate the process.
+#
+# NOTE(review): the exit status is 0 even when an error message was
+# printed -- confirm whether a non-zero status is wanted.
+def usage(msg=nil)
+  puts "***ERROR: #{msg}\n\n" if msg
+  puts <<EOT
+Usage:
+#{$0} action type input.dic output.dic
+  action: either 'convert' or 'normalize'
+           - 'convert' is used to convert the dict from
+             old RMMSeg format.
+           - 'normalize' is used to normalize an existing
+             rmmseg-cpp dict.
+  type:   either 'words' or 'chars'
+EOT
+  exit(0)
+end
+usage if ARGV.size != 4
+usage("unknown action #{ARGV[0]}") if ! ['convert', 'normalize'].include? ARGV[0]
+usage("unknown type #{ARGV[1]}") if ! ['words', 'chars'].include? ARGV[1]
+# Write +data+, a list of [num, word] pairs, to the output file named by
+# ARGV[3], one "num word" line per pair. Pairs whose word is nil (the
+# readers' marker for a rejected input line) are skipped.
+def output(data)
+  File.open(ARGV[3], "w") do |f|
+    data.each do |num, word|
+      f.puts "#{num} #{word}" if word
+    end
+  end
+end
+# Read the input file (ARGV[2]) as "char<whitespace>freq" lines and return
+# [freq, char] pairs with the frequencies rescaled so that the largest
+# one becomes 65535 (integer division). Lines that do not match yield
+# [nil, nil].
+#
+# NOTE(review): if every matched frequency is 0, max stays 0 and the
+# rescaling divides by zero -- confirm such input cannot occur.
+def read_RMMSeg_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(.)\s+(\d+)$/
+      n = $2.to_i
+      max = n if n > max
+      [n, $1]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    # The first pass has finished by now, so max is the overall maximum.
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+# Read the input file (ARGV[2]) with one word per line and return
+# [size, word] pairs; empty lines yield [nil, nil].
+#
+# NOTE(review): the dictionary format wants the number of characters.
+# Whether line.size counts characters or bytes depends on the Ruby
+# version and on the encoding the file is read with -- verify.
+def read_RMMSeg_words
+  File.readlines(ARGV[2]).map do |line|
+    line.chomp!
+    if !line.empty?
+      [line.size, line]
+    else
+      [nil, nil]
+    end
+  end
+end
+# Read the input file (ARGV[2]) as "freq<whitespace>char" lines and return
+# [freq, char] pairs with the frequencies rescaled so that the largest
+# one becomes 65535 (integer division). Lines that do not match yield
+# [nil, nil].
+#
+# NOTE(review): if every matched frequency is 0, max stays 0 and the
+# rescaling divides by zero -- confirm such input cannot occur.
+def read_rmmseg_cpp_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(.)$/
+      n = $1.to_i
+      max = n if n > max
+      [n, $2]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    # The first pass has finished by now, so max is the overall maximum.
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+# Read the input file (ARGV[2]) as "length<whitespace>word" lines and
+# return [length, word] pairs, both kept as the matched strings. Lines
+# that do not match yield [nil, nil].
+#
+# NOTE(review): on Ruby 1.9 and later \w appears to match only ASCII
+# word characters, in which case lines holding non-ASCII words would not
+# match and would be dropped from the output -- verify against the Ruby
+# version this script is run with.
+def read_rmmseg_cpp_words
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(\w+)$/
+      [$1, $2]
+    else
+      [nil, nil]
+    end
+  end
+end
+# Pick the reader for the (action, type) pair and write its result.
+# Both values were validated above, so one of the branches always
+# matches.
+case ARGV[0,2]
+when ['convert', 'chars']
+  output(read_RMMSeg_chars)
+when ['convert', 'words']
+  output(read_RMMSeg_words)
+when ['normalize', 'chars']
+  output(read_rmmseg_cpp_chars)
+when ['normalize', 'words']
+  output(read_rmmseg_cpp_words)
+end

data/misc/ferret_example.rb ADDED

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'rmmseg'
+require 'rmmseg/ferret'
+# Dictionaries need to be loaded explicitly before any tokenizing is done.
+RMMSeg::Dictionary.load_dictionaries
+# Analyzer that passes the RMMSeg tokenizer through Ferret's
+# LowerCaseFilter.
+analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+  Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+}
+# The index is kept in a global so that highlight_search below can use it.
+$index = Ferret::Index::Index.new(:analyzer => analyzer)
+# Sample documents: three with Chinese content and one in English.
+$index << {
+  :title => "分词",
+  :content => "中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。"
+}
+$index << {
+  :title => "RMMSeg",
+  :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。"
+}
+$index << {
+  :title => "Ruby 1.9",
+  :content => "Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。"
+}
+$index << {
+  :title => "Ferret",
+  :content => <<END
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.
+END
+}
+# Search the content field of $index for +key+ and, for every hit, print
+# the document title, its score and the content excerpt returned by
+# highlight, with matches wrapped in ANSI colour escape sequences.
+#
+# NOTE(review): the search query quotes the key (content:"key") while
+# the highlight query does not (content:key) -- confirm the difference
+# is intentional. +key+ is interpolated into both queries unescaped.
+def highlight_search(key)
+  $index.search_each(%Q!content:"#{key}"!) do |id, score|
+    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
+    puts "-"*40
+    highlights = $index.highlight("content:#{key}", id,
+                                  :field => :content,
+                                  :pre_tag => "\033[36m",
+                                  :post_tag => "\033[m")
+    puts "#{highlights}"
+    puts ""
+  end
+end
+# Treat every command line argument as a search key and print its hits.
+ARGV.each { |key|
+  puts "\033[33mSearching for #{key}...\033[m"
+  puts ""
+  highlight_search(key)
+}
+# Local Variables:
+# coding: utf-8
+# End:

data/misc/homepage.erb ADDED

@@ -0,0 +1,196 @@
+<%# -*- mode: text; coding: utf-8 -*- %>
+<%
+  $title = "rmmseg-cpp Homepage"
+  $authors = { 'pluskid' => 'http://blog.pluskid.org' }
+%>
+<% chapter "Introduction" do %>
+  rmmseg-cpp is a high performance Chinese word segmentation utility for
+  Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
+  as well as support for normal Ruby program usage.
+  rmmseg-cpp is a rewrite of the original
+  "RMMSeg":http://rmmseg.rubyforge.org/ gem in C++. RMMSeg is written
+  in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
+  lots of memory and the segmenting process is rather slow.
+  The interface is almost identical to RMMSeg but the performance is
+  much better. This gem is always preferable in production
+  use. However, if you want to understand how the MMSEG segmenting
+  algorithm works, the source code of RMMSeg is a better choice than
+  this.
+<% end %>
+<% chapter "Setup" do %>
+  <% section "Requirements" do %>
+    Your system needs the following software to run RMMSeg.
+    |_. Software                  |_. Notes                            |
+    | "Ruby":http://ruby-lang.org | Version 1.8.x is required          |
+    | RubyGems                    | rmmseg-cpp is released as a gem    |
+    | g++                         | Used to build the native extension |
+  <% end %>
+  <% section "Installation" do %>
+    <% section "Using RubyGems" do %>
+      To install the gem remotely from "RubyForge":http://rubyforge.org:
+        sudo gem install rmmseg-cpp
+      Or you can download the gem file manually from
+      "RubyForge":http://rubyforge.org/projects/rmmseg-cpp/ and
+      install it locally:
+        sudo gem install --local rmmseg-cpp-x.y.z.gem
+    <% end %>
+    <% section "From Git" do %>
+      To build the gem manually from the latest source code, you'll
+      need to have *git* and *rake* installed.
+      <% warning "The latest source code may be unstable" do %>
+        While I try to avoid this kind of problem, the source
+        code from the repository might still be broken sometimes.
+        It is generally not recommended to follow the source code.
+      <% end %>
+      The source code of rmmseg-cpp is hosted at
+      "GitHub":http://github.com/pluskid/rmmseg-cpp/. You can get the
+      source code by git clone:
+        git clone git://github.com/pluskid/rmmseg-cpp.git
+      then you can use Rake to build and install the gem:
+        cd rmmseg-cpp
+        rake gem:install
+    <% end %>
+  <% end %>
+<% end %>
+<% chapter "Usage" do %>
+  <% section "Stand Alone rmmseg" do %>
+    rmmseg-cpp comes with a script *rmmseg*. To get the basic usage, just execute it with <tt>-h</tt> option:
+      rmmseg -h
+    It reads from STDIN and prints the result to STDOUT. Here is a real
+    example:
+      $ echo "我们都喜欢用 Ruby" | rmmseg
+      我们 都 喜欢 用 Ruby
+  <% end %>
+  <% section "Use in Ruby program" do %>
+    <% section "Initialize" do %>
+      To use rmmseg-cpp in Ruby program, you'll first load it with RubyGems:
+      <code>
+      require 'rubygems'
+      require 'rmmseg'
+      </code>
+      Then you may customize the dictionaries used by rmmseg-cpp
+      (see "the rdoc":http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html on
+      how to add your own dictionaries) and load all dictionaries:
+      <code>
+      RMMSeg::Dictionary.load_dictionaries
+      </code>
+      Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
+      dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
+      <tt>load_dictionaries</tt>. e.g.
+      <code>
+      RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
+                                         [:words, "my_words.dic"],
+                                         [:words, "my_words2.dic"]]
+      </code>
+      The basic formats of the char-dictionary and the word-dictionary are similar. On each line,
+      there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
+      at the end of the dictionary file. The number has a different meaning in the
+      char-dictionary and in the word-dictionary.
+      In the char-dictionary, the number means the frequency of the character. In the word-dictionary,
+      the number means the number of characters in the word. Note that this is NOT the number
+      of *bytes* in the word.
+    <% end %>
+    <% section "Ferret Integration" do %>
+      To use rmmseg-cpp with Ferret, you'll need to @require@ the
+      Ferret support of rmmseg-cpp (Of course you'll also have to
+      have Ferret installed. If you have problems running the example
+      below, please try to update to the latest version of both
+      Ferret and rmmseg-cpp first):
+      <code>
+      require 'rmmseg/ferret'
+      </code>
+      rmmseg-cpp comes with a ready to use Ferret analyzer:
+      <code>
+      analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+        Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      }
+      index = Ferret::Index::Index.new(:analyzer => analyzer)
+      </code>
+      A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
+      of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
+      <% figure "Ferret Example Screenshot" do %>
+        !http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
+      <% end %>
+    <% end %>
+    <% section "Normal Ruby program" do %>
+      rmmseg-cpp can also be used in normal Ruby programs. Just create
+      an @Algorithm@ object and call @next_token@ until a @nil@ is returned:
+      <code>
+      algor = RMMSeg::Algorithm.new(text)
+      loop do
+        tok = algor.next_token
+        break if tok.nil?
+        puts "#{tok.text} [#{tok.start}..#{tok.end}]"
+      end
+      </code>
+    <% end %>
+  <% end %>
+<% end %>
+<% chapter "Who use it" do %>
+  <% tip "Expand this list" do %>
+    If you used rmmseg-cpp and would like your project to
+    appear in this list, please "contact me":mailto:pluskid@gmail.com.
+  <% end %>
+  * "JavaEye":http://www.javaeye.com/: One of the biggest software developer
+    communities in China.
+<% end %>
+<% chapter "Resources" do %>
+  * "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
+  * "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
+  * "Free Mind":http://blog.pluskid.org/: The author's blog.
+  * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
+<% end %>