rmmseg-cpp-traditional 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/History.txt +21 -0
- data/LICENSE.txt +22 -0
- data/Manifest.txt +43 -0
- data/README +111 -0
- data/README.md +29 -0
- data/Rakefile +19 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +222 -0
- data/ext/rmmseg/algor.h +80 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +17 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- data/lib/rmmseg-cpp-traditional/version.rb +7 -0
- data/lib/rmmseg-cpp-traditional.rb +9 -0
- data/lib/rmmseg.rb +3 -0
- data/misc/convert.rb +114 -0
- data/misc/ferret_example.rb +59 -0
- data/misc/homepage.erb +196 -0
- data/misc/homepage.html +1212 -0
- data/rmmseg-cpp-traditional.gemspec +19 -0
- data/spec/rmmseg_spec.rb +8 -0
- data/spec/spec_helper.rb +17 -0
- data/tasks/ann.rake +81 -0
- data/tasks/bones.rake +21 -0
- data/tasks/gem.rake +126 -0
- data/tasks/git.rake +41 -0
- data/tasks/homepage.rake +15 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/notes.rake +28 -0
- data/tasks/post_load.rake +39 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +58 -0
- data/tasks/setup.rb +268 -0
- data/tasks/spec.rake +55 -0
- data/tasks/svn.rake +48 -0
- data/tasks/test.rake +38 -0
- data/test/test_rmmseg.rb +0 -0
- metadata +116 -0
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rmmseg'
|
4
|
+
require 'rmmseg/ferret'
|
5
|
+
|
6
|
+
# dictionaries need to be explicitly loaded
|
7
|
+
RMMSeg::Dictionary.load_dictionaries
|
8
|
+
|
9
|
+
# Build the analyzer used for indexing and searching. The block receives the
# tokenizer and returns it wrapped in Ferret's LowerCaseFilter.
# NOTE(review): presumably the block's return value becomes the analyzer's
# token stream -- confirm against RMMSeg::Ferret::Analyzer.
analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
10
|
+
Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
11
|
+
}
|
12
|
+
|
13
|
+
# Global index shared by the sample documents below and by highlight_search.
$index = Ferret::Index::Index.new(:analyzer => analyzer)
|
14
|
+
|
15
|
+
$index << {
|
16
|
+
:title => "分词",
|
17
|
+
:content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
|
18
|
+
}
|
19
|
+
$index << {
|
20
|
+
:title => "RMMSeg",
|
21
|
+
:content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
|
22
|
+
}
|
23
|
+
$index << {
|
24
|
+
:title => "Ruby 1.9",
|
25
|
+
:content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
|
26
|
+
}
|
27
|
+
$index << {
|
28
|
+
:title => "Ferret",
|
29
|
+
:content => <<END
|
30
|
+
Ferret is a high-performance, full-featured text search engine library
|
31
|
+
written for Ruby. It is inspired by Apache Lucene Java project. With
|
32
|
+
the introduction of Ferret, Ruby users now have one of the fastest and
|
33
|
+
most flexible search libraries available. And it is surprisingly easy
|
34
|
+
to use.
|
35
|
+
END
|
36
|
+
}
|
37
|
+
|
38
|
+
# Search the :content field of the global $index for +key+ and print every
# hit: the document title and score, a 40-dash separator, then the
# highlighted excerpts followed by a blank line.
#
# key - the search term, interpolated unescaped into the query strings.
def highlight_search(key)
|
39
|
+
# The search query wraps the key in double quotes (phrase-style query).
$index.search_each(%Q!content:"#{key}"!) do |id, score|
|
40
|
+
puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
|
41
|
+
puts "-"*40
|
42
|
+
# NOTE(review): the highlight query is NOT quoted, unlike the search query
# above; the two may match differently for multi-word keys -- confirm
# against Ferret's query parser.
highlights = $index.highlight("content:#{key}", id,
|
43
|
+
:field => :content,
|
44
|
+
# ANSI escape: switch the terminal foreground to cyan for matched text.
:pre_tag => "\033[36m",
|
45
|
+
# ANSI escape: reset terminal attributes after the match.
:post_tag => "\033[m")
|
46
|
+
puts "#{highlights}"
|
47
|
+
puts ""
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
# Treat every command-line argument as a search key: print a banner
# (wrapped in ANSI yellow / reset escapes), a blank line, then the
# highlighted search results for that key.
ARGV.each { |key|
|
52
|
+
puts "\033[33mSearching for #{key}...\033[m"
|
53
|
+
puts ""
|
54
|
+
highlight_search(key)
|
55
|
+
}
|
56
|
+
|
57
|
+
# Local Variables:
|
58
|
+
# coding: utf-8
|
59
|
+
# End:
|
data/misc/homepage.erb
ADDED
@@ -0,0 +1,196 @@
|
|
1
|
+
<%# -*- mode: text; coding: utf-8 -*- %>
|
2
|
+
<%
|
3
|
+
$title = "rmmseg-cpp Homepage"
|
4
|
+
$authors = { 'pluskid' => 'http://blog.pluskid.org' }
|
5
|
+
%>
|
6
|
+
|
7
|
+
<% chapter "Introduction" do %>
|
8
|
+
|
9
|
+
rmmseg-cpp is a high performance Chinese word segmentation utility for
|
10
|
+
Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
|
11
|
+
as well as support for normal Ruby program usage.
|
12
|
+
|
13
|
+
rmmseg-cpp is a rewrite of the original
|
14
|
+
"RMMSeg":http://rmmseg.rubyforge.org/ gem in C++. RMMSeg is written
|
15
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
16
|
+
lots of memory and the segmenting process is rather slow.
|
17
|
+
|
18
|
+
The interface is almost identical to RMMSeg but the performance is
|
19
|
+
much better. This gem is always preferable in production
|
20
|
+
use. However, if you want to understand how the MMSEG segmenting
|
21
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
22
|
+
this.
|
23
|
+
|
24
|
+
<% end %>
|
25
|
+
|
26
|
+
<% chapter "Setup" do %>
|
27
|
+
<% section "Requirements" do %>
|
28
|
+
|
29
|
+
Your system needs the following software to run RMMSeg.
|
30
|
+
|
31
|
+
|_. Software |_. Notes |
|
32
|
+
| "Ruby":http://ruby-lang.org | Version 1.8.x is required |
|
33
|
+
| RubyGems | rmmseg-cpp is released as a gem |
|
34
|
+
| g++ | Used to build the native extension |
|
35
|
+
|
36
|
+
<% end %>
|
37
|
+
|
38
|
+
<% section "Installation" do %>
|
39
|
+
<% section "Using RubyGems" do %>
|
40
|
+
To install the gem remotely from "RubyForge":http://rubyforge.org:
|
41
|
+
|
42
|
+
sudo gem install rmmseg-cpp
|
43
|
+
|
44
|
+
Or you can download the gem file manually from
|
45
|
+
"RubyForge":http://rubyforge.org/projects/rmmseg-cpp/ and
|
46
|
+
install it locally:
|
47
|
+
|
48
|
+
sudo gem install --local rmmseg-cpp-x.y.z.gem
|
49
|
+
|
50
|
+
<% end %>
|
51
|
+
|
52
|
+
<% section "From Git" do %>
|
53
|
+
To build the gem manually from the latest source code, you'll
|
54
|
+
need to have *git* and *rake* installed.
|
55
|
+
|
56
|
+
<% warning "The latest source code may be unstable" do %>
|
57
|
+
|
58
|
+
While I try to avoid this kind of problem, the source
|
59
|
+
code from the repository might still be broken sometimes.
|
60
|
+
It is generally not recommended to follow the source code.
|
61
|
+
|
62
|
+
<% end %>
|
63
|
+
|
64
|
+
The source code of rmmseg-cpp is hosted at
|
65
|
+
"GitHub":http://github.com/pluskid/rmmseg-cpp/. You can get the
|
66
|
+
source code by git clone:
|
67
|
+
|
68
|
+
git clone git://github.com/pluskid/rmmseg-cpp.git
|
69
|
+
|
70
|
+
then you can use Rake to build and install the gem:
|
71
|
+
|
72
|
+
cd rmmseg-cpp
|
73
|
+
rake gem:install
|
74
|
+
|
75
|
+
<% end %>
|
76
|
+
<% end %>
|
77
|
+
<% end %>
|
78
|
+
|
79
|
+
<% chapter "Usage" do %>
|
80
|
+
|
81
|
+
<% section "Stand Alone rmmseg" do %>
|
82
|
+
rmmseg-cpp comes with a script *rmmseg*. To get the basic usage, just execute it with <tt>-h</tt> option:
|
83
|
+
|
84
|
+
rmmseg -h
|
85
|
+
|
86
|
+
It reads from STDIN and prints the result to STDOUT. Here is a real
|
87
|
+
example:
|
88
|
+
|
89
|
+
$ echo "我们都喜欢用 Ruby" | rmmseg
|
90
|
+
我们 都 喜欢 用 Ruby
|
91
|
+
|
92
|
+
<% end %>
|
93
|
+
|
94
|
+
<% section "Use in Ruby program" do %>
|
95
|
+
|
96
|
+
<% section "Initialize" do %>
|
97
|
+
|
98
|
+
To use rmmseg-cpp in a Ruby program, first load it with RubyGems:
|
99
|
+
|
100
|
+
<code>
|
101
|
+
require 'rubygems'
|
102
|
+
require 'rmmseg'
|
103
|
+
</code>
|
104
|
+
|
105
|
+
Then you may customize the dictionaries used by rmmseg-cpp
|
106
|
+
(see "the rdoc":http://rmmseg-cpp.rubyforge.org/rdoc/classes/RMMSeg/Dictionary.html on
|
107
|
+
how to add your own dictionaries) and load all dictionaries:
|
108
|
+
|
109
|
+
<code>
|
110
|
+
RMMSeg::Dictionary.load_dictionaries
|
111
|
+
</code>
|
112
|
+
|
113
|
+
Now rmmseg-cpp will be ready to do segmenting. If you want to load your own customized
|
114
|
+
dictionaries, please customize <tt>RMMSeg::Dictionary.dictionaries</tt> before calling
|
115
|
+
<tt>load_dictionaries</tt>. e.g.
|
116
|
+
|
117
|
+
<code>
|
118
|
+
RMMSeg::Dictionary.dictionaries = [[:chars, "my_chars.dic"],
|
119
|
+
[:words, "my_words.dic"],
|
120
|
+
[:words, "my_words2.dic"]]
|
121
|
+
</code>
|
122
|
+
|
123
|
+
The basic formats of the char-dictionary and word-dictionary are similar. For each line,
|
124
|
+
there is a number, then *a* space, then the string. Note there *SHOULD* be a newline
|
125
|
+
at the end of the dictionary file. And the number in char-dictionary and word-dictionary
|
126
|
+
has a different meaning in each.
|
127
|
+
|
128
|
+
In char-dictionary, the number means the frequency of the character. In word-dictionary,
|
129
|
+
the number means the number of characters in the word. Note that this is NOT the number
|
130
|
+
of *bytes* in the word.
|
131
|
+
|
132
|
+
<% end %>
|
133
|
+
|
134
|
+
<% section "Ferret Integration" do %>
|
135
|
+
|
136
|
+
To use rmmseg-cpp with Ferret, you'll need to @require@ the
|
137
|
+
Ferret support of rmmseg-cpp (Of course you'll also have to
|
138
|
+
get Ferret installed. If you have problems running the following
|
139
|
+
example, please try to update to the latest version of both
|
140
|
+
Ferret and rmmseg-cpp first):
|
141
|
+
|
142
|
+
<code>
|
143
|
+
require 'rmmseg/ferret'
|
144
|
+
</code>
|
145
|
+
|
146
|
+
rmmseg-cpp comes with a ready to use Ferret analyzer:
|
147
|
+
|
148
|
+
<code>
|
149
|
+
analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
150
|
+
Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
151
|
+
}
|
152
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
153
|
+
</code>
|
154
|
+
|
155
|
+
A complete example can be found in <tt>misc/ferret_example.rb</tt>. The result
|
156
|
+
of running that example is shown in <%= xref "Ferret Example Screenshot" %>.
|
157
|
+
|
158
|
+
<% figure "Ferret Example Screenshot" do %>
|
159
|
+
!http://lifegoo.pluskid.org/wp-content/uploads/2008/02/rmmseg.png!
|
160
|
+
<% end %>
|
161
|
+
|
162
|
+
<% end %>
|
163
|
+
|
164
|
+
<% section "Normal Ruby program" do %>
|
165
|
+
rmmseg-cpp can also be used in normal Ruby programs. Just create
|
166
|
+
an @Algorithm@ object and call @next_token@ until a @nil@ is returned:
|
167
|
+
|
168
|
+
<code>
|
169
|
+
algor = RMMSeg::Algorithm.new(text)
|
170
|
+
loop do
|
171
|
+
tok = algor.next_token
|
172
|
+
break if tok.nil?
|
173
|
+
puts "#{tok.text} [#{tok.start}..#{tok.end}]"
|
174
|
+
end
|
175
|
+
</code>
|
176
|
+
<% end %>
|
177
|
+
<% end %>
|
178
|
+
|
179
|
+
<% end %>
|
180
|
+
|
181
|
+
<% chapter "Who use it" do %>
|
182
|
+
<% tip "Expand this list" do %>
|
183
|
+
If you used rmmseg-cpp and would like your project to
|
184
|
+
appear in this list, please "contact me":mailto:pluskid@gmail.com.
|
185
|
+
<% end %>
|
186
|
+
|
187
|
+
* "JavaEye":http://www.javaeye.com/: One of the biggest software developer
|
188
|
+
communities in China.
|
189
|
+
<% end %>
|
190
|
+
|
191
|
+
<% chapter "Resources" do %>
|
192
|
+
* "Project Home":http://rubyforge.org/projects/rmmseg-cpp/: The Project page at RubyForge.
|
193
|
+
* "RDoc of rmmseg-cpp":http://rmmseg-cpp.rubyforge.org/rdoc/index.html: The auto generated rdoc of RMMSeg.
|
194
|
+
* "Free Mind":http://blog.pluskid.org/: The author's blog.
|
195
|
+
* "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problems.
|
196
|
+
<% end %>
|