lijia-rmmseg-cpp 10.2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
+ namespace rmmseg
5
+ {
6
/**
 * A (pointer, length) pair describing one piece of text.
 *
 * `text' may or may not be nul-terminated, so its extent must always be
 * taken from `length'.  A `length' of 0 denotes an empty token.
 */
struct Token
{
    const char *text;   /* start of the token text */
    int         length; /* extent of `text'; 0 means an empty token */

    Token(const char *txt, int len)
        : text(txt),
          length(len)
    {
    }
};
17
+ }
18
+
19
+ #endif /* _TOKEN_H_ */
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
/* Size of the text buffer embedded in every Word: enough for at least
   one character (3 bytes + '\0'). */
const int word_embed_len = 4;

/**
 * Header of a dictionary word, immediately followed by its text.
 *
 * `text' is declared with word_embed_len bytes; make_word() allocates
 * extra room past the end of the struct when the text is longer.
 */
struct Word
{
    unsigned char  nbytes;               /* number of bytes */
    char           length;               /* number of characters */
    unsigned short freq;                 /* frequency of the word */
    char           text[word_embed_len]; /* nul-terminated text */
};
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = std::strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = nbytes;
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
@@ -0,0 +1,3 @@
1
+ require File.join(File.dirname(__FILE__), 'rmmseg', 'dictionary')
2
+ require File.join(File.dirname(__FILE__), '..',
3
+ 'ext', 'rmmseg', 'rmmseg')
@@ -0,0 +1,59 @@
1
module RMMSeg
  # Keeps the list of dictionary files used by RMMSeg and loads them.
  module Dictionary
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")

    @dictionaries = [
      [:chars, File.join(data_dir, "chars.dic")],
      [:words, File.join(data_dir, "words.dic")]
    ]

    class << self
      #
      # The array of dictionaries used by RMMSeg. Every entry looks like
      #
      #   [type, path]
      #
      # +type+ is either <tt>:chars</tt> or <tt>:words</tt> and +path+ is
      # the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the form
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. They are separated by <b>exactly one space</b>.
      #
      # A <tt>:words</tt> dictionary is similar:
      #
      #   length word
      #
      # but the leading number is the number of characters (not the
      # number of bytes) in the word rather than a frequency.
      #
      # The script convert.rb in the tools directory can be used to
      # convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary located at +path+. +type+ can
      # be +:chars+ or <tt>:words</tt>; see the doc of dictionaries.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this after the dictionary
      # paths have been set up and before any Algorithm object is
      # created. Entries whose type is neither +:chars+ nor +:words+ are
      # skipped.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'rubygems'
2
+ require 'rmmseg'
3
+ require 'ferret'
4
+
5
+ module RMMSeg
6
+ module Ferret
7
# The Analyzer class can be used with Ferret.
class Analyzer < ::Ferret::Analysis::Analyzer

  # Construct an Analyzer. An optional block receives the Tokenizer
  # and can wrap it in further +TokenFilter+s, e.g.
  #
  #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
  #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  #   }
  #
  def initialize(&brk)
    @brk = brk
  end

  # Build the token stream for +text+: a fresh Tokenizer, passed
  # through the block given to the constructor when there is one.
  # +field+ is not used.
  def token_stream(field, text)
    tokenizer = Tokenizer.new(text)
    return tokenizer unless @brk

    @brk.call(tokenizer)
  end
end
30
+
31
# The Tokenizer tokenizes text with RMMSeg::Algorithm.
class Tokenizer < ::Ferret::Analysis::TokenStream
  # The text being tokenized
  attr_reader :text

  # Create a new Tokenizer to tokenize +str+
  def initialize(str)
    self.text = str
  end

  # Get the next token, or +nil+ when the algorithm has no more
  # tokens. The same Ferret token object is refilled and returned on
  # every call.
  def next
    segment = @algor.next_token
    return nil if segment.nil?

    @token.text  = segment.text
    @token.start = segment.start
    @token.end   = segment.end
    @token
  end

  # Set the text to be tokenized. This also resets the reusable token
  # and creates a new Algorithm for the text.
  def text=(str)
    @token = ::Ferret::Analysis::Token.new("", 0, 0)
    @text = str
    @algor = Algorithm.new(@text)
  end
end
63
+ end
64
+ end
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # A utility used to convert the old RMMSeg dictionary
4
+ # to rmmseg-cpp format.
5
+
6
+ # There are several constraints for the new rmmseg-cpp
7
+ # dictionary format.
8
+ # - length of word should be specified in the dict
9
+ # - number and string should be separated by ONE space
10
+ # - there should be a newline at the end of file
11
+
12
+ # $KCODE='u'
13
+ # require 'jcode'
14
+
15
# Print the usage text and terminate the script.
#
# msg:: optional description of what was wrong with the command line.
#       When given, the error and the usage text are written to
#       standard error and the script exits with status 1, so callers
#       (shells, Rake tasks, ...) can detect the failure. Without it the
#       usage text goes to standard output and the exit status is 0.
def usage(msg=nil)
  stream = msg ? $stderr : $stdout
  stream.puts "***ERROR: #{msg}\n\n" if msg
  stream.puts <<EOT
Usage:

  #{$0} action type input.dic output.dic

  action: either 'convert' or 'normalize'
          - 'convert' is used to convert the dict from
            old RMMSeg format.
          - 'normalize' is used to normalize an existing
            rmmseg-cpp dict.

  type: either 'words' or 'chars'

EOT
  exit(msg ? 1 : 0)
end
33
+
34
# Validate the command line: exactly four arguments, a known action
# and a known dictionary type.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless ['convert', 'normalize'].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless ['words', 'chars'].include?(ARGV[1])
37
+
38
# Write the dictionary entries to the output file named by ARGV[3].
#
# data:: an array of [num, word] pairs. Each pair becomes one line of
#        the form "num word"; pairs without a word are skipped.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each do |num, word|
      next unless word

      out.puts "#{num} #{word}"
    end
  end
end
45
+
46
# Read a character dictionary in the old RMMSeg format ("char freq"
# per line) from the file named by ARGV[2].
#
# The file is read as UTF-8 regardless of the locale, so that "." in
# the pattern matches one character rather than one byte.
#
# Returns one [freq, char] pair per input line, with the frequencies
# rescaled so that the largest one becomes 65535. Lines that do not
# match the format yield [nil, nil]. When every frequency is 0 the
# frequencies stay 0 instead of raising ZeroDivisionError.
def read_RMMSeg_chars
  entries = File.readlines(ARGV[2], :encoding => 'UTF-8').map do |line|
    if line =~ /^(.)\s+(\d+)$/
      [$2.to_i, $1]
    else
      [nil, nil]
    end
  end

  max = entries.map { |freq, _| freq }.compact.max || 0

  entries.map do |freq, char|
    next [nil, nil] unless char

    [max.zero? ? 0 : freq * 65535 / max, char]
  end
end
64
+
65
# Read a word dictionary in the old RMMSeg format (one word per line)
# from the file named by ARGV[2].
#
# The file is read as UTF-8 regardless of the locale: the rmmseg-cpp
# format needs the number of characters of each word, and String#size
# would count bytes if the lines were read in a single-byte encoding.
#
# Returns one [length, word] pair per input line; empty lines yield
# [nil, nil].
def read_RMMSeg_words
  File.readlines(ARGV[2], :encoding => 'UTF-8').map do |line|
    word = line.chomp
    word.empty? ? [nil, nil] : [word.size, word]
  end
end
75
+
76
# Read a character dictionary in rmmseg-cpp format ("freq char" per
# line) from the file named by ARGV[2] in order to normalize it.
#
# The file is read as UTF-8 regardless of the locale, so that "." in
# the pattern matches one character rather than one byte.
#
# Returns one [freq, char] pair per input line, with the frequencies
# rescaled so that the largest one becomes 65535. Lines that do not
# match the format yield [nil, nil]. When every frequency is 0 the
# frequencies stay 0 instead of raising ZeroDivisionError.
def read_rmmseg_cpp_chars
  entries = File.readlines(ARGV[2], :encoding => 'UTF-8').map do |line|
    if line =~ /^(\d+)\s+(.)$/
      [$1.to_i, $2]
    else
      [nil, nil]
    end
  end

  max = entries.map { |freq, _| freq }.compact.max || 0

  entries.map do |freq, char|
    next [nil, nil] unless char

    [max.zero? ? 0 : freq * 65535 / max, char]
  end
end
94
+
95
# Read a word dictionary in rmmseg-cpp format ("length word" per line)
# from the file named by ARGV[2] in order to normalize it.
#
# The word is matched with [[:word:]] rather than \w: in Ruby 1.9 and
# later \w only matches ASCII letters, digits and "_", which would
# reject every Chinese word. The file is read as UTF-8 regardless of
# the locale so that the pattern sees characters, not bytes.
#
# Returns one [length, word] pair of strings per input line; lines
# that do not match the format yield [nil, nil].
def read_rmmseg_cpp_words
  File.readlines(ARGV[2], :encoding => 'UTF-8').map do |line|
    if line =~ /^(\d+)\s+([[:word:]]+)$/
      [$1, $2]
    else
      [nil, nil]
    end
  end
end
104
+
105
# Pick the reader that matches the requested action/type pair and
# write whatever it returns to the output dictionary.
readers = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}
reader = readers[ARGV[0,2]]
output(send(reader)) if reader
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'rmmseg/ferret'
5
+
6
+ # dictionaries need to be explicitly loaded
7
+ RMMSeg::Dictionary.load_dictionaries
8
+
9
+ analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
10
+ Ferret::Analysis::LowerCaseFilter.new(tokenizer)
11
+ }
12
+
13
+ $index = Ferret::Index::Index.new(:analyzer => analyzer)
14
+
15
+ $index << {
16
+ :title => "分词",
17
+ :content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
18
+ }
19
+ $index << {
20
+ :title => "RMMSeg",
21
+ :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
22
+ }
23
+ $index << {
24
+ :title => "Ruby 1.9",
25
+ :content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
26
+ }
27
+ $index << {
28
+ :title => "Ferret",
29
+ :content => <<END
30
+ Ferret is a high-performance, full-featured text search engine library
31
+ written for Ruby. It is inspired by Apache Lucene Java project. With
32
+ the introduction of Ferret, Ruby users now have one of the fastest and
33
+ most flexible search libraries available. And it is surprisingly easy
34
+ to use.
35
+ END
36
+ }
37
+
38
# Search the global $index for documents whose content matches +key+
# as a phrase. For every hit, print its title and score, a separator
# line and the content highlights for +key+ wrapped in terminal colour
# escape sequences.
def highlight_search(key)
  phrase_query = %Q!content:"#{key}"!

  $index.search_each(phrase_query) do |id, score|
    title = $index[id][:title]
    puts "*** Document \"#{title}\" found with a score of #{score}"
    puts "-"*40

    highlights = $index.highlight("content:#{key}", id,
                                  :field => :content,
                                  :pre_tag => "\033[36m",
                                  :post_tag => "\033[m")
    puts "#{highlights}"
    puts ""
  end
end
50
+
51
# Run a highlighted search for every keyword given on the command line.
ARGV.each do |key|
  puts "\033[33mSearching for #{key}...\033[m"
  puts ""
  highlight_search(key)
end
56
+
57
+ # Local Variables:
58
+ # coding: utf-8
59
+ # End:
@@ -0,0 +1,196 @@
1
+ <%# -*- mode: text; coding: utf-8 -*- %>
2
+ <%
3
+ $title = "rmmseg-cpp Homepage"
4
+ $authors = { 'pluskid' => 'http://blog.pluskid.org' }
5
+ %>
6
+
7
+ <% chapter "Introduction" do %>
8
+
9
+ rmmseg-cpp is a high performance Chinese word segmentation utility for
10
+ Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
11
+ as well as support for normal Ruby program usage.
12
+
13
+ rmmseg-cpp is a rewrite of the original
14
+ "RMMSeg":http://rmmseg.rubyforge.org/ gem in C++. RMMSeg is written
15
+ in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
16
+ lots of memory and the segmenting process is rather slow.
17
+
18
+ The interface is almost identical to RMMSeg but the performance is
19
+ much better. This gem is always preferable in production
20
+ use. However, if you want to understand how the MMSEG segmenting
21
+ algorithm works, the source code of RMMSeg is a better choice than
22
+ this.
23
+
24
+ <% end %>
25
+
26
+ <% chapter "Setup" do %>
27
+ <% section "Requirements" do %>
28
+
29
+ Your system needs the following software to run RMMSeg.
30
+
31
+ |_. Software |_. Notes |
32
+ | "Ruby":http://ruby-lang.org | Version 1.8.x is required |
33
+ | RubyGems | rmmseg-cpp is released as a gem |
34
+ | g++ | Used to build the native extension |
35
+
36
+ <% end %>
37
+
38
+ <% section "Installation" do %>
39
+ <% section "Using RubyGems" do %>
40
+ To install the gem remotely from "RubyForge":http://rubyforge.org:
41
+
42
+ sudo gem install rmmseg-cpp
43
+
44
+ Or you can download the gem file manually from
45
+ "RubyForge":http://rubyforge.org/projects/rmmseg-cpp/ and
46
+ install it locally:
47
+
48
+ sudo gem install --local rmmseg-cpp-x.y.z.gem
49
+
50
+ <% end %>
51
+
52
+ <% section "From Git" do %>
53
+ To build the gem manually from the latest source code, you'll
54
+ need to have *git* and *rake* installed.
55
+
56
+ <% warning "The latest source code may be unstable" do %>
57
+
58
+ While I tried to avoid such kind of problems, the source
59
+ code from the repository might still be broken sometimes.
60
+ It is generally not recommended to follow the source code.
61
+
62
+ <% end %>
63
+
64
+ The source code of rmmseg-cpp is hosted at
65
+ "GitHub":http://github.com/pluskid/rmmseg-cpp/. You can get the
66
+ source code by git clone:
67
+
68
+ git clone git://github.com/pluskid/rmmseg-cpp.git
69
+
70
+ then you can use Rake to build and install the gem:
71
+
72
+ cd rmmseg-cpp
73
+ rake gem:install
74
+
75
+ <% end %>
76
+ <% end %>
77
+ <% end %>
78
+
79
+ <% chapter "Usage" do %>
80
+
81
+ <% section "Stand Alone rmmseg" do %>
82
+ rmmseg-cpp comes with a script *rmmseg*. To get the basic usage, just execute it with <tt>-h</tt> option:
83
+
84
+ rmmseg -h
85
+
86
+ It reads from STDIN and prints the result to STDOUT. Here is a real
87
+ example:
88
+
89
+ $ echo "我们都喜欢用 Ruby" | rmmseg
90
+ 我们 都 喜欢 用 Ruby
91
+
92
+ <% end %>
93
+
94
+ <% section "Use in Ruby program" do %>
95
+
96
+ <% section "Initialize" do %>
97
+
98
+ To use rmmseg-cpp in Ruby program, you'll first load it with RubyGems:
99
+
100
+ <code>
101
+ require 'rubygems'
102
+ require 'rmmseg'
103
+ </code>
104
+
105
+ Then you may customize the dictionaries used by rmmseg-cpp
106
+ (see "the rdoc":http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html on
107
+ how to add your own dictionaries) and load all dictionaries:
108
+
109
+ <code>
110
+ RMMSeg::Dictionary.load_dictionaries
111
+ </code>
112
+
113
+ Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
114
+ dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
115
+ <tt>load_dictionaries</tt>. e.g.
116
+
117
+ <code>
118
+ RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
119
+ [:words, "my_words.dic"],
120
+ [:words, "my_words2.dic"]]
121
+ </code>
122
+
123
+ The basic formats of the char-dictionary and the word-dictionary are similar. For each line,
124
+ there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
125
+ at the end of the dictionary file. Note also that the number in char-dictionary and word-dictionary
126
+ has a different meaning.
127
+
128
+ In char-dictionary, the number means the frequency of the character. In word-dictionary,
129
+ the number means the number of characters in the word. Note that this is NOT the number
130
+ of *bytes* in the word.
131
+
132
+ <% end %>
133
+
134
+ <% section "Ferret Integration" do %>
135
+
136
+ To use rmmseg-cpp with Ferret, you'll need to @require@ the
137
+ Ferret support of rmmseg-cpp (of course you'll also need to
138
+ have Ferret installed. If you have problems running the following
139
+ example, please try to update to the latest version of both
140
+ Ferret and rmmseg-cpp first):
141
+
142
+ <code>
143
+ require 'rmmseg/ferret'
144
+ </code>
145
+
146
+ rmmseg-cpp comes with a ready to use Ferret analyzer:
147
+
148
+ <code>
149
+ analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
150
+ Ferret::Analysis::LowerCaseFilter.new(tokenizer)
151
+ }
152
+ index = Ferret::Index::Index.new(:analyzer => analyzer)
153
+ </code>
154
+
155
+ A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
156
+ of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
157
+
158
+ <% figure "Ferret Example Screenshot" do %>
159
+ !http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
160
+ <% end %>
161
+
162
+ <% end %>
163
+
164
+ <% section "Normal Ruby program" do %>
165
+ rmmseg-cpp can also be used in normal Ruby programs. Just create
166
+ an @Algorithm@ object and call @next_token@ until a @nil@ is returned:
167
+
168
+ <code>
169
+ algor = RMMSeg::Algorithm.new(text)
170
+ loop do
171
+ tok = algor.next_token
172
+ break if tok.nil?
173
+ puts "#{tok.text} [#{tok.start}..#{tok.end}]"
174
+ end
175
+ </code>
176
+ <% end %>
177
+ <% end %>
178
+
179
+ <% end %>
180
+
181
+ <% chapter "Who use it" do %>
182
+ <% tip "Expand this list" do %>
183
+ If you used rmmseg-cpp and would like your project to
184
+ appear in this list, please "contact me":mailto:pluskid@gmail.com.
185
+ <% end %>
186
+
187
+ * "JavaEye":http://www.javaeye.com/: One of the biggest software developer
188
+ communities in China.
189
+ <% end %>
190
+
191
+ <% chapter "Resources" do %>
192
+ * "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
193
+ * "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
194
+ * "Free Mind":http://blog.pluskid.org/: The author's blog.
195
+ * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
196
+ <% end %>