rmmseg-cpp-traditional 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/History.txt +21 -0
- data/LICENSE.txt +22 -0
- data/Manifest.txt +43 -0
- data/README +111 -0
- data/README.md +29 -0
- data/Rakefile +19 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +222 -0
- data/ext/rmmseg/algor.h +80 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +17 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- data/lib/rmmseg-cpp-traditional/version.rb +7 -0
- data/lib/rmmseg-cpp-traditional.rb +9 -0
- data/lib/rmmseg.rb +3 -0
- data/misc/convert.rb +114 -0
- data/misc/ferret_example.rb +59 -0
- data/misc/homepage.erb +196 -0
- data/misc/homepage.html +1212 -0
- data/rmmseg-cpp-traditional.gemspec +19 -0
- data/spec/rmmseg_spec.rb +8 -0
- data/spec/spec_helper.rb +17 -0
- data/tasks/ann.rake +81 -0
- data/tasks/bones.rake +21 -0
- data/tasks/gem.rake +126 -0
- data/tasks/git.rake +41 -0
- data/tasks/homepage.rake +15 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/notes.rake +28 -0
- data/tasks/post_load.rake +39 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +58 -0
- data/tasks/setup.rb +268 -0
- data/tasks/spec.rake +55 -0
- data/tasks/svn.rake +48 -0
- data/tasks/test.rake +38 -0
- data/test/test_rmmseg.rb +0 -0
- metadata +116 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/History.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
== 0.2.9 / 2011-09-10
|
2
|
+
|
3
|
+
* Fix GC-related bugs in Ruby C extension.
|
4
|
+
|
5
|
+
== 0.2.8 / 2010-03-22
|
6
|
+
|
7
|
+
* Minor release, fixed building bugs in Ruby 1.9.
|
8
|
+
|
9
|
+
== 0.2.7 / 2008-09-17
|
10
|
+
|
11
|
+
* Fix various stupid bugs (typo) that cause problems under MacOSX.
|
12
|
+
|
13
|
+
== 0.2.6 / 2008-08-14
|
14
|
+
|
15
|
+
* Fix the problem that caused a hang when the library was required (OS: MacOSX-Leopard)
|
16
|
+
|
17
|
+
== 0.2.5 / 2008-06-07
|
18
|
+
|
19
|
+
* Created homepage on Rubyforge.
|
20
|
+
* Gem can be installed directly from Rubyforge now.
|
21
|
+
* Added rdoc for native extensions.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 MichaelHsu
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README
|
4
|
+
Rakefile
|
5
|
+
bin/rmmseg
|
6
|
+
data/chars.dic
|
7
|
+
data/words.dic
|
8
|
+
ext/rmmseg/algor.cpp
|
9
|
+
ext/rmmseg/algor.h
|
10
|
+
ext/rmmseg/chunk.h
|
11
|
+
ext/rmmseg/dict.cpp
|
12
|
+
ext/rmmseg/dict.h
|
13
|
+
ext/rmmseg/extconf.rb
|
14
|
+
ext/rmmseg/memory.cpp
|
15
|
+
ext/rmmseg/memory.h
|
16
|
+
ext/rmmseg/rmmseg.cpp
|
17
|
+
ext/rmmseg/rules.h
|
18
|
+
ext/rmmseg/token.h
|
19
|
+
ext/rmmseg/word.h
|
20
|
+
lib/rmmseg.rb
|
21
|
+
lib/rmmseg/dictionary.rb
|
22
|
+
lib/rmmseg/ferret.rb
|
23
|
+
misc/convert.rb
|
24
|
+
misc/ferret_example.rb
|
25
|
+
misc/homepage.erb
|
26
|
+
misc/homepage.html
|
27
|
+
spec/rmmseg_spec.rb
|
28
|
+
spec/spec_helper.rb
|
29
|
+
tasks/ann.rake
|
30
|
+
tasks/bones.rake
|
31
|
+
tasks/gem.rake
|
32
|
+
tasks/git.rake
|
33
|
+
tasks/homepage.rake
|
34
|
+
tasks/manifest.rake
|
35
|
+
tasks/notes.rake
|
36
|
+
tasks/post_load.rake
|
37
|
+
tasks/rdoc.rake
|
38
|
+
tasks/rubyforge.rake
|
39
|
+
tasks/setup.rb
|
40
|
+
tasks/spec.rake
|
41
|
+
tasks/svn.rake
|
42
|
+
tasks/test.rake
|
43
|
+
test/test_rmmseg.rb
|
data/README
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
rmmseg-cpp
|
2
|
+
by pluskid
|
3
|
+
http://rmmseg-cpp.rubyforge.org
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
rmmseg-cpp is a high performance Chinese word segmentation utility for
|
8
|
+
Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
|
9
|
+
as well as support for normal Ruby program usage.
|
10
|
+
|
11
|
+
rmmseg-cpp is a rewrite of the original
|
12
|
+
RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written
|
13
|
+
in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
|
14
|
+
lots of memory and the segmenting process is rather slow.
|
15
|
+
|
16
|
+
The interface is almost identical to RMMSeg but the performance is
|
17
|
+
much better. This gem is always preferable in production
|
18
|
+
use. However, if you want to understand how the MMSEG segmenting
|
19
|
+
algorithm works, the source code of RMMSeg is a better choice than
|
20
|
+
this.
|
21
|
+
|
22
|
+
== FEATURES/PROBLEMS:
|
23
|
+
|
24
|
+
* Runs fast and the memory consumption is small.
|
25
|
+
* Supports user-customized dictionaries.
|
26
|
+
* Easy Ferret integration.
|
27
|
+
|
28
|
+
== SYNOPSIS:
|
29
|
+
|
30
|
+
=== A simple script
|
31
|
+
|
32
|
+
rmmseg-cpp provides a simple script, which can read the text from
|
33
|
+
standard input and print the segmented result to standard output. Try
|
34
|
+
<tt>rmmseg -h</tt> for help on the options.
|
35
|
+
|
36
|
+
=== In a normal Ruby program
|
37
|
+
|
38
|
+
To use rmmseg-cpp in a normal Ruby program, first load the package and
|
39
|
+
initialize it by loading the dictionaries:
|
40
|
+
|
41
|
+
require 'rubygems'
|
42
|
+
require 'rmmseg'
|
43
|
+
|
44
|
+
RMMSeg::Dictionary.load_dictionaries
|
45
|
+
|
46
|
+
If you want to add customized dictionaries, append them to
|
47
|
+
+RMMSeg::Dictionary.dictionaries+ before calling +load_dictionaries+.
|
48
|
+
The formats of chars.dic and words.dic are NOT the same:
|
49
|
+
|
50
|
+
* For chars.dic, each line contains freq, a space, and then the character
|
51
|
+
* For words.dic, each line contains length, a space, and then the word.
|
52
|
+
|
53
|
+
Note that length means the length of the word, i.e. the number of characters
|
54
|
+
of the word, not number of bytes. WARNING: there should be a newline at
|
55
|
+
the end of every dictionary file.
|
56
|
+
|
57
|
+
Then create an +Algorithm+ object and call +next_token+ until it returns
|
58
|
+
+nil+:
|
59
|
+
|
60
|
+
algor = RMMSeg::Algorithm.new(text)
|
61
|
+
loop do
|
62
|
+
tok = algor.next_token
|
63
|
+
break if tok.nil?
|
64
|
+
puts "#{tok.text} [#{tok.start}..#{tok.end}]"
|
65
|
+
end
|
66
|
+
|
67
|
+
=== With Ferret
|
68
|
+
|
69
|
+
To use rmmseg-cpp with Ferret, just use the analyzer provided:
|
70
|
+
|
71
|
+
analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
|
72
|
+
Ferret::Analysis::LowerCaseFilter.new(tokenizer)
|
73
|
+
}
|
74
|
+
|
75
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
76
|
+
|
77
|
+
See <tt>misc/ferret_example.rb</tt> for a complete example.
|
78
|
+
|
79
|
+
== REQUIREMENTS:
|
80
|
+
|
81
|
+
* ruby 1.8.x
|
82
|
+
* g++
|
83
|
+
|
84
|
+
== INSTALL:
|
85
|
+
|
86
|
+
* sudo gem install rmmseg-cpp
|
87
|
+
|
88
|
+
== LICENSE:
|
89
|
+
|
90
|
+
(The MIT License)
|
91
|
+
|
92
|
+
Copyright (c) 2008 FIXME (different license?)
|
93
|
+
|
94
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
95
|
+
a copy of this software and associated documentation files (the
|
96
|
+
'Software'), to deal in the Software without restriction, including
|
97
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
98
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
99
|
+
permit persons to whom the Software is furnished to do so, subject to
|
100
|
+
the following conditions:
|
101
|
+
|
102
|
+
The above copyright notice and this permission notice shall be
|
103
|
+
included in all copies or substantial portions of the Software.
|
104
|
+
|
105
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
106
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
107
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
108
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
109
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
110
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
111
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Rmmseg::Cpp::Traditional
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'rmmseg-cpp-traditional'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install rmmseg-cpp-traditional
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- Ruby -*-
|
2
|
+
|
3
|
+
load 'tasks/setup.rb'
|
4
|
+
|
5
|
+
ensure_in_path 'lib'
|
6
|
+
require 'rmmseg'
|
7
|
+
|
8
|
+
task :default => 'spec:run'
|
9
|
+
|
10
|
+
PROJ.name = 'rmmseg-cpp'
|
11
|
+
PROJ.version = '0.2.9'
|
12
|
+
PROJ.authors = 'pluskid'
|
13
|
+
PROJ.email = 'pluskid@gmail.com'
|
14
|
+
PROJ.url = 'http://rmmseg-cpp.rubyforge.org'
|
15
|
+
PROJ.rubyforge.name = 'rmmseg-cpp'
|
16
|
+
|
17
|
+
PROJ.spec.opts << '--color'
|
18
|
+
|
19
|
+
# EOF
|
data/bin/rmmseg
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path(
|
4
|
+
File.join(File.dirname(__FILE__), '..', 'lib', 'rmmseg'))
|
5
|
+
|
6
|
+
require 'getoptlong'
|
7
|
+
|
8
|
+
def print_usage
|
9
|
+
puts <<EOF
|
10
|
+
#{__FILE__} Segment Chinese text. Read from stdin and print to stdout.
|
11
|
+
|
12
|
+
Options:
|
13
|
+
-h
|
14
|
+
--help Print this message
|
15
|
+
|
16
|
+
-s
|
17
|
+
--separator Select the separator of the segmented text. Default is
|
18
|
+
space.
|
19
|
+
EOF
|
20
|
+
exit 0
|
21
|
+
end
|
22
|
+
|
23
|
+
separator = " "
|
24
|
+
|
25
|
+
optparser = GetoptLong.new
|
26
|
+
optparser.set_options(["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
|
27
|
+
["-h", "--help", GetoptLong::NO_ARGUMENT])
|
28
|
+
|
29
|
+
loop do
|
30
|
+
begin
|
31
|
+
opt, arg = optparser.get
|
32
|
+
break if not opt
|
33
|
+
|
34
|
+
case opt
|
35
|
+
when "-h"
|
36
|
+
print_usage
|
37
|
+
|
38
|
+
when "-s"
|
39
|
+
separator = arg
|
40
|
+
end
|
41
|
+
|
42
|
+
rescue => err
|
43
|
+
puts err
|
44
|
+
exit 1
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
RMMSeg::Dictionary.load_dictionaries
|
49
|
+
algor = RMMSeg::Algorithm.new(STDIN.read)
|
50
|
+
tok = algor.next_token
|
51
|
+
unless tok.nil?
|
52
|
+
print tok.text
|
53
|
+
|
54
|
+
loop do
|
55
|
+
tok = algor.next_token
|
56
|
+
break if tok.nil?
|
57
|
+
print separator
|
58
|
+
print tok.text
|
59
|
+
end
|
60
|
+
puts
|
61
|
+
end
|
62
|
+
|
63
|
+
# EOF
|