RubyGems - rmmseg-cpp - Versions diffs - 0.2.5 - Mend

rmmseg-cpp 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

data/ext/rmmseg/token.h ADDED Viewed

@@ -0,0 +1,19 @@
+#ifndef _TOKEN_H_
+#define _TOKEN_H_
+namespace rmmseg
+{
+    /* A piece of text described by a pointer and a length. The
+     * constructor stores the pointer as given; nothing is copied. */
+    struct Token
+    {
+        // Start of the token text. It may or may not be
+        // nul-terminated, so always use `length' to find its extent.
+        const char *text;
+        // Length of `text'. A length of 0 marks an empty token.
+        int length;
+        Token(const char *txt, int len)
+            : text(txt), length(len)
+        {
+        }
+    };
+}
+#endif /* _TOKEN_H_ */

data/ext/rmmseg/word.h ADDED Viewed

@@ -0,0 +1,44 @@
+#ifndef _WORD_H_
+#define _WORD_H_
+#include <climits>
+#include <cstring>
+#include "memory.h"
+namespace rmmseg
+{
+    /* Size in bytes of the text buffer embedded in Word. make_word()
+     * allocates extra space past the end of the struct when the text
+     * (plus its terminating '\0') does not fit in this many bytes. */
+    const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
+    /* A dictionary word. `text' is declared with word_embed_len bytes
+     * but make_word() sizes the allocation so that it can hold
+     * `nbytes' bytes plus a '\0'; the field order therefore matters
+     * (`text' must stay last). */
+    struct Word
+    {
+        unsigned char   nbytes;   /* number of bytes */
+        char            length;   /* number of characters */
+        unsigned short  freq;     /* frequency; make_word() clamps it to USHRT_MAX */
+        char            text[word_embed_len]; /* nul-terminated text, may extend past the struct */
+    };
+    /**
+     * Allocate a Word with pool_alloc() and fill it in.
+     *
+     * text:   the text of the word. It need not be nul-terminated
+     *         when nbytes is given explicitly.
+     * length: number of characters (not bytes).
+     * freq:   the frequency of the word; values above USHRT_MAX are
+     *         clamped to USHRT_MAX.
+     * nbytes: number of bytes of text to copy, or -1 to use
+     *         strlen(text) (text must then be nul-terminated).
+     *
+     * Returns the new Word, whose text is always nul-terminated.
+     *
+     * NOTE: Word::nbytes is an unsigned char, so a byte count that
+     * does not fit in it is truncated when stored.
+     */
+    inline Word *make_word(const char *text, int length=1,
+                           int freq=0, int nbytes=-1)
+    {
+        if (freq > USHRT_MAX)
+            freq = USHRT_MAX;   /* avoid overflow */
+        if (nbytes == -1)
+            nbytes = static_cast<int>(std::strlen(text));
+        /* Word already embeds word_embed_len bytes of text, so only
+         * the room beyond that (plus the '\0') is added. */
+        Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
+                                                 + nbytes+1
+                                                 - word_embed_len));
+        /* Store the byte count that is actually copied. Calling
+         * strlen(text) here would ignore an explicit nbytes and read
+         * past the end of a buffer that is not nul-terminated. */
+        w->nbytes = nbytes;
+        w->length = length;
+        w->freq = freq;
+        std::strncpy(w->text, text, nbytes);
+        w->text[nbytes] = '\0';
+        return w;
+    }
+}
+#endif /* _WORD_H_ */

data/lib/rmmseg.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
+require File.join(File.dirname(__FILE__), '..',
+                  'ext', 'rmmseg', 'rmmseg')

data/lib/rmmseg/dictionary.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module RMMSeg
+  module Dictionary
+    @dictionaries = [
+                     [:chars, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "chars.dic")],
+                     [:words, File.join(File.dirname(__FILE__),
+                                        "..", "..", "data", "words.dic")]
+                    ]
+    class << self
+      #
+      # An array of dictionaries used by RMMSeg. Each entry is of the
+      # following form:
+      #
+      #   [type, path]
+      #
+      # where +type+ can either <tt>:chars</tt> or <tt>:words</tt>. +path+ is the path
+      # to the dictionary file.
+      #
+      # The format of <tt>:chars</tt> dictionary is a collection of lines of the
+      # following form:
+      #
+      #   freq char
+      #
+      # Where +frequency+ is a number <b>less than 65535</b>. +char+ is the
+      # character. They are spearated by <b>exactly one space</b>.
+      #
+      # The format of <tt>:words</tt> dictionary is similar:
+      #
+      #   length word
+      #
+      # except the first number is not the frequency, but the number of
+      # characters (not number of bytes) in the word.
+      #
+      # There's a script (convert.rb) in the tools directory that can be used
+      # to convert and normalize dictionaries.
+      attr_accessor :dictionaries
+      # Add a user defined dictionary, +type+ can be
+      # +:chars+ or <tt>:words</tt>. See doc of dictionaries.
+      def add_dictionary(path, type)
+        @dictionaries << [type, path]
+      end
+      # Load dictionaries. Call this method after set up the path of the
+      # dictionaries needed to load and before any Algorithm object is
+      # created.
+      def load_dictionaries()
+        @dictionaries.each do |type, path|
+          if type == :chars
+            load_chars(path)
+          elsif type == :words
+            load_words(path)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/rmmseg/ferret.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'rubygems'
+require 'rmmseg'
+require 'ferret'
+module RMMSeg
+  module Ferret
+        # The Analyzer class can be used with Ferret .
+    class Analyzer < ::Ferret::Analysis::Analyzer
+      # Construct an Analyzer. Optional block can be used to
+      # add more +TokenFilter+s. e.g.
+      #
+      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+      #   }
+      #
+      def initialize(&brk)
+        @brk = brk
+      end
+      def token_stream(field, text)
+        t = Tokenizer.new(text)
+        if @brk
+          @brk.call(t)
+        else
+          t
+        end
+      end
+    end
+    # The Tokenizer tokenize text with RMMSeg::Algorithm.
+    class Tokenizer < ::Ferret::Analysis::TokenStream
+      # Create a new Tokenizer to tokenize +text+
+      def initialize(str)
+        self.text = str
+      end
+      # Get next token
+      def next
+        tok = @algor.next_token
+        if tok.nil?
+          return nil
+        else
+          @token.text = tok.text
+          @token.start = tok.start
+          @token.end = tok.end
+          return @token
+        end
+      end
+      # Get the text being tokenized
+      def text
+        @text
+      end
+      # Set the text to be tokenized
+      def text=(str)
+        @token = ::Ferret::Analysis::Token.new("", 0, 0)
+        @text = str
+        @algor = Algorithm.new(@text)
+      end
+    end
+  end
+end

data/misc/convert.rb ADDED Viewed

@@ -0,0 +1,114 @@
+#!/usr/bin/ruby
+# A utility used to convert the old RMMSeg dictionary
+# to rmmseg-cpp format.
+# There are several constraints for the new rmmseg-cpp
+# dictionary format.
+#  - length of word should be specified in the dict
+#  - number and string should be separated by ONE space
+#  - there should be a newline at the end of file
+$KCODE='u'
+require 'jcode'
+# Print the usage text, preceded by an error banner when +msg+ is
+# given, and terminate the script.
+#
+# Every caller in this script reaches this method because the command
+# line is invalid, so the process exits with a non-zero status to let
+# shells and other scripts detect the failure.
+def usage(msg=nil)
+  puts "***ERROR: #{msg}\n\n" if msg
+  puts <<EOT
+Usage:
+#{$0} action type input.dic output.dic
+  action: either 'convert' or 'normalize'
+           - 'convert' is used to convert the dict from
+             old RMMSeg format.
+           - 'normalize' is used to normalize an existing
+             rmmseg-cpp dict.
+  type:   either 'words' or 'chars'
+EOT
+  exit(1)
+end
+# Validate the command line: exactly four arguments, a known action
+# and a known type.
+usage unless ARGV.size == 4
+usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
+usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
+def output(data)
+  File.open(ARGV[3], "w") do |f|
+    data.each do |num, word|
+      f.puts "#{num} #{word}" if word
+    end
+  end
+end
+def read_RMMSeg_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(.)\s+(\d+)$/
+      n = $2.to_i
+      max = n if n > max
+      [n, $1]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_RMMSeg_words
+  File.readlines(ARGV[2]).map do |line|
+    line.chomp!
+    if !line.empty?
+      [line.jlength, line]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(.)$/
+      n = $1.to_i
+      max = n if n > max
+      [n, $2]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_words
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(\w+)$/
+      [$1, $2]
+    else
+      [nil, nil]
+    end
+  end
+end
+case ARGV[0,2]
+when ['convert', 'chars']
+  output(read_RMMSeg_chars)
+when ['convert', 'words']
+  output(read_RMMSeg_words)
+when ['normalize', 'chars']
+  output(read_rmmseg_cpp_chars)
+when ['normalize', 'words']
+  output(read_rmmseg_cpp_words)
+end

data/misc/ferret_example.rb ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'rmmseg'
+require 'rmmseg/ferret'
+# dictionaries needed to be explicitly loaded
+RMMSeg::Dictionary.load_dictionaries
+analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+  Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+}
+$index = Ferret::Index::Index.new(:analyzer => analyzer)
+$index << {
+  :title => "分词",
+  :content => "中文分词比较困难，不像英文那样，直接在空格和标点符号的地方断开就可以了。"
+}
+$index << {
+  :title => "RMMSeg",
+  :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现，下一步是和 Ferret 进行集成。"
+}
+$index << {
+  :title => "Ruby 1.9",
+  :content => "Ruby 1.9.0 已经发布了，1.9 的一个重大改进就是对 Unicode 的支持。"
+}
+$index << {
+  :title => "Ferret",
+  :content => <<END
+Ferret is a high-performance, full-featured text search engine library
+written for Ruby. It is inspired by Apache Lucene Java project. With
+the introduction of Ferret, Ruby users now have one of the fastest and
+most flexible search libraries available. And it is surprisingly easy
+to use.
+END
+}
+def highlight_search(key)
+  $index.search_each(%Q!content:"#{key}"!) do |id, score|
+    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
+    puts "-"*40
+    highlights = $index.highlight("content:#{key}", id,
+                                  :field => :content,
+                                  :pre_tag => "\033[36m",
+                                  :post_tag => "\033[m")
+    puts "#{highlights}"
+    puts ""
+  end
+end
+ARGV.each { |key|
+  puts "\033[33mSearching for #{key}...\033[m"
+  puts ""
+  highlight_search(key)
+}
+# Local Variables:
+# coding: utf-8
+# End:

data/spec/rmmseg_spec.rb ADDED Viewed

@@ -0,0 +1,8 @@
+# $Id$
+require File.join(File.dirname(__FILE__), %w[spec_helper])
+describe Rmmseg do
+end
+# EOF

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# $Id$
+require File.expand_path(
+    File.join(File.dirname(__FILE__), %w[.. lib rmmseg]))
+# Global RSpec configuration for the spec suite. Nothing is
+# configured at the moment; the block only documents the available
+# mock-framework switches.
+Spec::Runner.configure do |config|
+  # == Mock Framework
+  #
+  # RSpec uses its own mocking framework by default. If you prefer to
+  # use mocha, flexmock or RR, uncomment the appropriate line:
+  #
+  # config.mock_with :mocha
+  # config.mock_with :flexmock
+  # config.mock_with :rr
+end
+# EOF

data/tasks/ann.rake ADDED Viewed

@@ -0,0 +1,81 @@
+# $Id$
+begin
+  require 'bones/smtp_tls'
+rescue LoadError
+  require 'net/smtp'
+end
+require 'time'
+namespace :ann do
+  # A prerequisites task that all other tasks depend upon
+  task :prereqs
+  file PROJ.ann.file do
+    ann = PROJ.ann
+    puts "Generating #{ann.file}"
+    File.open(ann.file,'w') do |fd|
+      fd.puts("#{PROJ.name} version #{PROJ.version}")
+      fd.puts("    by #{Array(PROJ.authors).first}") if PROJ.authors
+      fd.puts("    #{PROJ.url}") if PROJ.url.valid?
+      fd.puts("    (the \"#{PROJ.release_name}\" release)") if PROJ.release_name
+      fd.puts
+      fd.puts("== DESCRIPTION")
+      fd.puts
+      fd.puts(PROJ.description)
+      fd.puts
+      fd.puts(PROJ.changes.sub(%r/^.*$/, '== CHANGES'))
+      fd.puts
+      ann.paragraphs.each do |p|
+        fd.puts "== #{p.upcase}"
+        fd.puts
+        fd.puts paragraphs_of(PROJ.readme_file, p).join("\n\n")
+        fd.puts
+      end
+      fd.puts ann.text if ann.text
+    end
+  end
+  # ann:announcement only makes sure the announcement file exists; the
+  # file task for PROJ.ann.file does the actual work.
+  desc "Create an announcement file"
+  task :announcement => ['ann:prereqs', PROJ.ann.file]
+  # ann:email mails the announcement file. The sender falls back to
+  # PROJ.email when ann.email[:from] is not set.
+  desc "Send an email announcement"
+  task :email => ['ann:prereqs', PROJ.ann.file] do
+    ann = PROJ.ann
+    from = ann.email[:from] || PROJ.email
+    to   = Array(ann.email[:to])
+    ### build a mail header for RFC 822
+    rfc822msg =  "From: #{from}\n"
+    rfc822msg << "To: #{to.join(',')}\n"
+    rfc822msg << "Subject: [ANN] #{PROJ.name} #{PROJ.version}"
+    rfc822msg << " (#{PROJ.release_name})" if PROJ.release_name
+    rfc822msg << "\n"
+    rfc822msg << "Date: #{Time.new.rfc822}\n"
+    # The Message-Id is built from the current time and the configured
+    # mail domain; the blank line after it ends the header section.
+    rfc822msg << "Message-Id: "
+    rfc822msg << "<#{"%.8f" % Time.now.to_f}@#{ann.email[:domain]}>\n\n"
+    rfc822msg << File.read(ann.file)
+    # Positional arguments for Net::SMTP.start, in this exact order:
+    # server, port, domain, account, password, authentication type.
+    params = [:server, :port, :domain, :acct, :passwd, :authtype].map do |key|
+      ann.email[key]
+    end
+    # params[3] is the account: default to the project e-mail address.
+    params[3] = PROJ.email if params[3].nil?
+    # params[4] is the password: ask for it on the terminal when it is
+    # not configured. NOTE(review): the input is read with plain
+    # STDIN.gets, so it is presumably echoed while typed -- confirm.
+    if params[4].nil?
+      STDOUT.write "Please enter your e-mail password (#{params[3]}): "
+      params[4] = STDIN.gets.chomp
+    end
+    ### send email
+    Net::SMTP.start(*params) {|smtp| smtp.sendmail(rfc822msg, from, to)}
+  end
+end  # namespace :ann
+# Top-level shortcut: `rake ann` runs ann:announcement.
+desc 'Alias to ann:announcement'
+task :ann => 'ann:announcement'
+# Add the generated announcement file to CLOBBER (presumably the
+# rake/clean file list, so that `rake clobber` removes it -- confirm).
+CLOBBER << PROJ.ann.file
+# EOF