rmmseg-cpp 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
+ namespace rmmseg
5
+ {
6
// A piece of text described by a pointer and a byte count.  The
// constructor stores the pointer as given; no copy of the text is made.
struct Token
{
    // `text' is not required to be nul-terminated, so always rely on
    // `length' for its size.  A `length' of 0 denotes an empty token.
    const char *text;
    int length;

    Token(const char *txt, int len)
        : text(txt),
          length(len)
    {
    }
};
17
+ }
18
+
19
+ #endif /* _TOKEN_H_ */
data/ext/rmmseg/word.h ADDED
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
12
+ struct Word
13
+ {
14
+ unsigned char nbytes; /* number of bytes */
15
+ char length; /* number of characters */
16
+ unsigned short freq;
17
+ char text[word_embed_len];
18
+ };
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = std::strlen(text);
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
2
+ require File.join(File.dirname(__FILE__), '..',
3
+ 'ext', 'rmmseg', 'rmmseg')
@@ -0,0 +1,59 @@
1
module RMMSeg
  module Dictionary
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")

    # The bundled dictionaries, as [type, path] pairs.
    @dictionaries = [
      [:chars, File.join(data_dir, "chars.dic")],
      [:words, File.join(data_dir, "words.dic")]
    ]

    class << self
      #
      # An array of dictionaries used by RMMSeg. Each entry has the
      # following form:
      #
      #   [type, path]
      #
      # where +type+ can be either <tt>:chars</tt> or <tt>:words</tt>, and
      # +path+ is the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the
      # following form:
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. They are separated by <b>exactly one space</b>.
      #
      # The format of a <tt>:words</tt> dictionary is similar:
      #
      #   length word
      #
      # except that the first number is not the frequency, but the number
      # of characters (not the number of bytes) in the word.
      #
      # There's a script (convert.rb) in the tools directory that can be
      # used to convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary located at +path+. +type+ can
      # be <tt>:chars</tt> or <tt>:words</tt>; see the documentation of
      # +dictionaries+ for the file formats.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this method after the
      # dictionary paths have been set up and before any Algorithm object
      # is created. Entries whose type is neither <tt>:chars</tt> nor
      # <tt>:words</tt> are skipped.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'rubygems'
2
+ require 'rmmseg'
3
+ require 'ferret'
4
+
5
module RMMSeg
  module Ferret
    # An Analyzer that plugs RMMSeg into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Build an Analyzer. An optional block receives the tokenizer and
      # may wrap it in further +TokenFilter+s, e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Create the token stream for +text+. When a block was given to
      # the constructor, its result is returned; otherwise the bare
      # Tokenizer is. +field+ is not used.
      def token_stream(field, text)
        tokenizer = Tokenizer.new(text)
        @brk ? @brk.call(tokenizer) : tokenizer
      end
    end

    # A token stream that splits text with RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # The text being tokenized
      attr_reader :text

      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Return the next token, or +nil+ once the algorithm has no more
      # tokens. The same Ferret token object is refilled and returned on
      # every call.
      def next
        tok = @algor.next_token
        return nil if tok.nil?

        @token.text  = tok.text
        @token.start = tok.start
        @token.end   = tok.end
        @token
      end

      # Set the text to be tokenized. This also replaces the reusable
      # token and starts a fresh Algorithm on the new text.
      def text=(str)
        @token = ::Ferret::Analysis::Token.new("", 0, 0)
        @text = str
        @algor = Algorithm.new(@text)
      end
    end
  end
end
data/misc/convert.rb ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # A utility used to convert the old RMMSeg dictionary
4
+ # to rmmseg-cpp format.
5
+
6
+ # There are several constrains for the new rmmseg-cpp
7
+ # dictionary format.
8
+ # - length of word should be specified in the dict
9
+ # - number and string should be separated by ONE space
10
+ # - there should be a newline at the end of file
11
+
12
+ $KCODE='u'
13
+ require 'jcode'
14
+
15
# Print the command line help and terminate the script.
#
# msg:: optional description of what was wrong with the invocation. When
#       given it is printed before the help text and the script exits
#       with status 1, so that callers (shell scripts, make, ...) can
#       detect the failure. Without a message the exit status is 0.
def usage(msg=nil)
  puts "***ERROR: #{msg}\n\n" if msg
  puts <<EOT
Usage:

#{$0} action type input.dic output.dic

action: either 'convert' or 'normalize'
- 'convert' is used to convert the dict from
old RMMSeg format.
- 'normalize' is used to normalize an existing
rmmseg-cpp dict.

type: either 'words' or 'chars'

EOT
  exit(msg ? 1 : 0)
end
33
+
34
# Validate the command line: exactly four arguments, a known action and
# a known dictionary type. Anything else prints the help and exits.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
37
+
38
# Write +data+, an array of [number, word] pairs, to the output file
# named by ARGV[3], one "number word" line per pair. Pairs without a
# word (the placeholders produced for unparsable input lines) are
# skipped.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each do |num, word|
      next unless word
      out.puts "#{num} #{word}"
    end
  end
end
45
+
46
# Read an old-format RMMSeg character dictionary from ARGV[2], whose
# lines look like "char freq", and return an array of [freq, char]
# pairs with every frequency rescaled so that the largest one becomes
# 65535.
#
# Lines that do not match the expected format yield a [nil, nil]
# placeholder. When no frequency is greater than 0 the rescaled value is
# 0 (dividing by the maximum would raise ZeroDivisionError).
def read_RMMSeg_chars
  max = 0
  entries = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(.)\s+(\d+)$/
      freq = $2.to_i
      max = freq if freq > max
      [freq, $1]
    else
      [nil, nil]
    end
  end

  # Second pass: the maximum is only known once every line has been read.
  entries.map do |freq, char|
    if char
      [max.zero? ? 0 : freq * 65535 / max, char]
    else
      [nil, nil]
    end
  end
end
64
+
65
# Read an old-format RMMSeg word dictionary from ARGV[2] (one word per
# line) and return an array of [number_of_characters, word] pairs.
# Empty lines yield a [nil, nil] placeholder.
#
# The character count uses String#jlength when it is available (Ruby 1.8
# with jcode loaded) and String#length otherwise, because jcode does not
# exist on Ruby 1.9 and later, where String#length already counts
# characters rather than bytes.
def read_RMMSeg_words
  File.readlines(ARGV[2]).map do |line|
    word = line.chomp
    if word.empty?
      [nil, nil]
    else
      nchars = word.respond_to?(:jlength) ? word.jlength : word.length
      [nchars, word]
    end
  end
end
75
+
76
# Read an rmmseg-cpp character dictionary from ARGV[2], whose lines look
# like "freq char", and return an array of [freq, char] pairs with every
# frequency rescaled so that the largest one becomes 65535.
#
# Lines that do not match the expected format yield a [nil, nil]
# placeholder. When no frequency is greater than 0 the rescaled value is
# 0 (dividing by the maximum would raise ZeroDivisionError).
def read_rmmseg_cpp_chars
  max = 0
  entries = File.readlines(ARGV[2]).map do |line|
    if line =~ /^(\d+)\s+(.)$/
      freq = $1.to_i
      max = freq if freq > max
      [freq, $2]
    else
      [nil, nil]
    end
  end

  # Second pass: the maximum is only known once every line has been read.
  entries.map do |freq, char|
    if char
      [max.zero? ? 0 : freq * 65535 / max, char]
    else
      [nil, nil]
    end
  end
end
94
+
95
# Read an rmmseg-cpp word dictionary from ARGV[2], whose lines look like
# "length word", and return an array of [length, word] pairs. Both
# elements are the strings captured from the line. Lines that do not
# match yield a [nil, nil] placeholder.
def read_rmmseg_cpp_words
  File.readlines(ARGV[2]).map do |line|
    fields = /^(\d+)\s+(\w+)$/.match(line)
    fields ? [fields[1], fields[2]] : [nil, nil]
  end
end
104
+
105
# Pick the reader that matches the requested action and type, then
# write whatever it returns to the output file.
readers = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}
reader = readers[ARGV[0,2]]
output(send(reader)) if reader
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'rmmseg/ferret'
5
+
6
# Dictionaries are not loaded automatically; do it before any analysis.
RMMSeg::Dictionary.load_dictionaries

# Tokenize with RMMSeg and lower-case every token.
analyzer = RMMSeg::Ferret::Analyzer.new do |tokenizer|
  Ferret::Analysis::LowerCaseFilter.new(tokenizer)
end

$index = Ferret::Index::Index.new(:analyzer => analyzer)

ferret_intro = <<END
Ferret is a high-performance, full-featured text search engine library
written for Ruby. It is inspired by Apache Lucene Java project. With
the introduction of Ferret, Ruby users now have one of the fastest and
most flexible search libraries available. And it is surprisingly easy
to use.
END

# Sample documents, added to the index in this order.
[
  {
    :title => "分词",
    :content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
  },
  {
    :title => "RMMSeg",
    :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
  },
  {
    :title => "Ruby 1.9",
    :content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
  },
  {
    :title => "Ferret",
    :content => ferret_intro
  }
].each { |doc| $index << doc }
37
+
38
# Search the +content+ field of $index for +key+ as a phrase and print
# every hit: the document title, its score and the highlighted excerpt
# (wrapped in ANSI colour escapes).
#
# The same query string is used for searching and for highlighting, so
# that what gets highlighted is what actually matched. Previously the
# highlight query left out the quotes, which makes a multi-word key a
# different query than the phrase that was searched for.
def highlight_search(key)
  query = %Q!content:"#{key}"!
  $index.search_each(query) do |id, score|
    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
    puts "-"*40
    highlights = $index.highlight(query, id,
                                  :field => :content,
                                  :pre_tag => "\033[36m",
                                  :post_tag => "\033[m")
    puts "#{highlights}"
    puts ""
  end
end
50
+
51
# Run one highlighted search per command line argument.
ARGV.each do |key|
  puts "\033[33mSearching for #{key}...\033[m"
  puts ""
  highlight_search(key)
end
56
+
57
+ # Local Variables:
58
+ # coding: utf-8
59
+ # End:
@@ -0,0 +1,8 @@
1
+ # $Id$
2
+
3
+ require File.join(File.dirname(__FILE__), %w[spec_helper])
4
+
5
# The library's top-level module is spelled RMMSeg; referring to it as
# `Rmmseg' raises NameError as soon as this spec file is loaded.
describe RMMSeg do
end
7
+
8
+ # EOF
@@ -0,0 +1,17 @@
1
+ # $Id$
2
+
3
+ require File.expand_path(
4
+ File.join(File.dirname(__FILE__), %w[.. lib rmmseg]))
5
+
6
+ Spec::Runner.configure do |config|
7
+ # == Mock Framework
8
+ #
9
+ # RSpec uses its own mocking framework by default. If you prefer to
10
+ # use mocha, flexmock or RR, uncomment the appropriate line:
11
+ #
12
+ # config.mock_with :mocha
13
+ # config.mock_with :flexmock
14
+ # config.mock_with :rr
15
+ end
16
+
17
+ # EOF
data/tasks/ann.rake ADDED
@@ -0,0 +1,81 @@
1
+ # $Id$
2
+
3
+ begin
4
+ require 'bones/smtp_tls'
5
+ rescue LoadError
6
+ require 'net/smtp'
7
+ end
8
+ require 'time'
9
+
10
namespace :ann do

  # A prerequisites task that all other tasks depend upon
  task :prereqs

  # Generate the announcement file: a header with the project name and
  # version, the description, the changes, the README paragraphs named
  # in the announcement settings and, if present, the extra text.
  file PROJ.ann.file do
    announcement = PROJ.ann
    puts "Generating #{announcement.file}"
    File.open(announcement.file,'w') do |out|
      out.puts("#{PROJ.name} version #{PROJ.version}")
      out.puts(" by #{Array(PROJ.authors).first}") if PROJ.authors
      out.puts(" #{PROJ.url}") if PROJ.url.valid?
      out.puts(" (the \"#{PROJ.release_name}\" release)") if PROJ.release_name
      out.puts
      out.puts("== DESCRIPTION")
      out.puts
      out.puts(PROJ.description)
      out.puts
      # The first line of the changes is replaced by a section heading.
      out.puts(PROJ.changes.sub(%r/^.*$/, '== CHANGES'))
      out.puts
      announcement.paragraphs.each do |title|
        out.puts "== #{title.upcase}"
        out.puts
        out.puts paragraphs_of(PROJ.readme_file, title).join("\n\n")
        out.puts
      end
      out.puts announcement.text if announcement.text
    end
  end

  desc "Create an announcement file"
  task :announcement => ['ann:prereqs', PROJ.ann.file]

  desc "Send an email announcement"
  task :email => ['ann:prereqs', PROJ.ann.file] do
    announcement = PROJ.ann
    settings = announcement.email
    from = settings[:from] || PROJ.email
    to = Array(settings[:to])

    ### build a mail header for RFC 822
    subject = "[ANN] #{PROJ.name} #{PROJ.version}"
    subject << " (#{PROJ.release_name})" if PROJ.release_name

    rfc822msg = [
      "From: #{from}",
      "To: #{to.join(',')}",
      "Subject: #{subject}",
      "Date: #{Time.new.rfc822}",
      "Message-Id: <#{"%.8f" % Time.now.to_f}@#{settings[:domain]}>"
    ].join("\n")
    # A blank line separates the headers from the body.
    rfc822msg << "\n\n" << File.read(announcement.file)

    server, port, domain, account, password, authtype =
      [:server, :port, :domain, :acct, :passwd, :authtype].map { |key| settings[key] }

    # Fall back to the project e-mail address as the account name.
    account = PROJ.email if account.nil?

    # Ask for the password interactively when it is not configured.
    if password.nil?
      STDOUT.write "Please enter your e-mail password (#{account}): "
      password = STDIN.gets.chomp
    end

    ### send email
    Net::SMTP.start(server, port, domain, account, password, authtype) do |smtp|
      smtp.sendmail(rfc822msg, from, to)
    end
  end
end  # namespace :ann

desc 'Alias to ann:announcement'
task :ann => 'ann:announcement'

# Make `rake clobber' remove the generated announcement file.
CLOBBER << PROJ.ann.file
80
+
81
+ # EOF