rmmseg-cpp-traditional 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/History.txt +21 -0
  4. data/LICENSE.txt +22 -0
  5. data/Manifest.txt +43 -0
  6. data/README +111 -0
  7. data/README.md +29 -0
  8. data/Rakefile +19 -0
  9. data/bin/rmmseg +63 -0
  10. data/data/chars.dic +12638 -0
  11. data/data/words.dic +120308 -0
  12. data/ext/rmmseg/algor.cpp +222 -0
  13. data/ext/rmmseg/algor.h +80 -0
  14. data/ext/rmmseg/chunk.h +59 -0
  15. data/ext/rmmseg/dict.cpp +230 -0
  16. data/ext/rmmseg/dict.h +34 -0
  17. data/ext/rmmseg/extconf.rb +17 -0
  18. data/ext/rmmseg/memory.cpp +9 -0
  19. data/ext/rmmseg/memory.h +43 -0
  20. data/ext/rmmseg/rmmseg.cpp +263 -0
  21. data/ext/rmmseg/rules.h +86 -0
  22. data/ext/rmmseg/token.h +19 -0
  23. data/ext/rmmseg/word.h +44 -0
  24. data/lib/rmmseg/dictionary.rb +59 -0
  25. data/lib/rmmseg/ferret.rb +64 -0
  26. data/lib/rmmseg-cpp-traditional/version.rb +7 -0
  27. data/lib/rmmseg-cpp-traditional.rb +9 -0
  28. data/lib/rmmseg.rb +3 -0
  29. data/misc/convert.rb +114 -0
  30. data/misc/ferret_example.rb +59 -0
  31. data/misc/homepage.erb +196 -0
  32. data/misc/homepage.html +1212 -0
  33. data/rmmseg-cpp-traditional.gemspec +19 -0
  34. data/spec/rmmseg_spec.rb +8 -0
  35. data/spec/spec_helper.rb +17 -0
  36. data/tasks/ann.rake +81 -0
  37. data/tasks/bones.rake +21 -0
  38. data/tasks/gem.rake +126 -0
  39. data/tasks/git.rake +41 -0
  40. data/tasks/homepage.rake +15 -0
  41. data/tasks/manifest.rake +49 -0
  42. data/tasks/notes.rake +28 -0
  43. data/tasks/post_load.rake +39 -0
  44. data/tasks/rdoc.rake +51 -0
  45. data/tasks/rubyforge.rake +58 -0
  46. data/tasks/setup.rb +268 -0
  47. data/tasks/spec.rake +55 -0
  48. data/tasks/svn.rake +48 -0
  49. data/tasks/test.rake +38 -0
  50. data/test/test_rmmseg.rb +0 -0
  51. metadata +116 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rmmseg-cpp-traditional.gemspec
4
+ gemspec
data/History.txt ADDED
@@ -0,0 +1,21 @@
1
+ == 0.2.9 / 2011-09-10
2
+
3
+ * Fix GC-related bugs in Ruby C extension.
4
+
5
+ == 0.2.8 / 2010-03-22
6
+
7
+ * Minor release, fixed building bugs in Ruby 1.9.
8
+
9
+ == 0.2.7 / 2008-09-17
10
+
11
+ * Fix various stupid bugs (typo) that cause problems under MacOSX.
12
+
13
+ == 0.2.6 / 2008-08-14
14
+
15
+ * Fix the problem that caused a hang when the gem was required (OS: MacOSX-Leopard)
16
+
17
+ == 0.2.5 / 2008-06-07
18
+
19
+ * Created homepage on Rubyforge.
20
+ * Gem can be installed directly from Rubyforge now.
21
+ * Added rdoc for native extensions.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 MichaelHsu
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,43 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README
4
+ Rakefile
5
+ bin/rmmseg
6
+ data/chars.dic
7
+ data/words.dic
8
+ ext/rmmseg/algor.cpp
9
+ ext/rmmseg/algor.h
10
+ ext/rmmseg/chunk.h
11
+ ext/rmmseg/dict.cpp
12
+ ext/rmmseg/dict.h
13
+ ext/rmmseg/extconf.rb
14
+ ext/rmmseg/memory.cpp
15
+ ext/rmmseg/memory.h
16
+ ext/rmmseg/rmmseg.cpp
17
+ ext/rmmseg/rules.h
18
+ ext/rmmseg/token.h
19
+ ext/rmmseg/word.h
20
+ lib/rmmseg.rb
21
+ lib/rmmseg/dictionary.rb
22
+ lib/rmmseg/ferret.rb
23
+ misc/convert.rb
24
+ misc/ferret_example.rb
25
+ misc/homepage.erb
26
+ misc/homepage.html
27
+ spec/rmmseg_spec.rb
28
+ spec/spec_helper.rb
29
+ tasks/ann.rake
30
+ tasks/bones.rake
31
+ tasks/gem.rake
32
+ tasks/git.rake
33
+ tasks/homepage.rake
34
+ tasks/manifest.rake
35
+ tasks/notes.rake
36
+ tasks/post_load.rake
37
+ tasks/rdoc.rake
38
+ tasks/rubyforge.rake
39
+ tasks/setup.rb
40
+ tasks/spec.rake
41
+ tasks/svn.rake
42
+ tasks/test.rake
43
+ test/test_rmmseg.rb
data/README ADDED
@@ -0,0 +1,111 @@
1
+ rmmseg-cpp
2
+ by pluskid
3
+ http://rmmseg-cpp.rubyforge.org
4
+
5
+ == DESCRIPTION:
6
+
7
+ rmmseg-cpp is a high performance Chinese word segmentation utility for
8
+ Ruby. It features full "Ferret":http://ferret.davebalmain.com/ integration
9
+ as well as support for normal Ruby program usage.
10
+
11
+ rmmseg-cpp is a rewrite of the original
12
+ RMMSeg(http://rmmseg.rubyforge.org/) gem in C++. RMMSeg is written
13
+ in pure Ruby. Though I tried hard to tweak RMMSeg, it just consumes
14
+ lots of memory and the segmenting process is rather slow.
15
+
16
+ The interface is almost identical to RMMSeg but the performance is
17
+ much better. This gem is always preferable in production
18
+ use. However, if you want to understand how the MMSEG segmenting
19
+ algorithm works, the source code of RMMSeg is a better choice than
20
+ this.
21
+
22
+ == FEATURES/PROBLEMS:
23
+
24
+ * Runs fast and the memory consumption is small.
25
+ * Supports user-customized dictionaries.
26
+ * Easy Ferret integration.
27
+
28
+ == SYNOPSIS:
29
+
30
+ === A simple script
31
+
32
+ rmmseg-cpp provides a simple script, which can read the text from
33
+ standard input and print the segmented result to standard output. Try
34
+ <tt>rmmseg -h</tt> for help on the options.
35
+
36
+ === In a normal Ruby program
37
+
38
+ To use rmmseg-cpp in a normal Ruby program, first load the package and
39
+ init by loading the dictionaries:
40
+
41
+ require 'rubygems'
42
+ require 'rmmseg'
43
+
44
+ RMMSeg::Dictionary.load_dictionaries
45
+
46
+ If you want to add customized dictionaries, append them to
47
+ +RMMSeg::Dictionary.dictionaries+ before calling +load_dictionaries+.
48
+ The formats of chars.dic and words.dic are NOT the same:
49
+
50
+ * For chars.dic, each line contains freq, a space, and then the character
51
+ * For words.dic, each line contains length, a space, and then the word.
52
+
53
+ Note that length means the length of the word, i.e. the number of characters
54
+ of the word, not number of bytes. WARNING: there should be a newline at
55
+ the end of every dictionary file.
56
+
57
+ Then create an +Algorithm+ object and call +next_token+ until you get a
58
+ +nil+:
59
+
60
+ algor = RMMSeg::Algorithm.new(text)
61
+ loop do
62
+ tok = algor.next_token
63
+ break if tok.nil?
64
+ puts "#{tok.text} [#{tok.start}..#{tok.end}]"
65
+ end
66
+
67
+ === With Ferret
68
+
69
+ To use rmmseg-cpp with Ferret, just use the analyzer provided:
70
+
71
+ analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
72
+ Ferret::Analysis::LowerCaseFilter.new(tokenizer)
73
+ }
74
+
75
+ index = Ferret::Index::Index.new(:analyzer => analyzer)
76
+
77
+ See <tt>misc/ferret_example.rb</tt> for a complete example.
78
+
79
+ == REQUIREMENTS:
80
+
81
+ * ruby 1.8.x
82
+ * g++
83
+
84
+ == INSTALL:
85
+
86
+ * sudo gem install rmmseg-cpp
87
+
88
+ == LICENSE:
89
+
90
+ (The MIT License)
91
+
92
+ Copyright (c) 2008 FIXME (different license?)
93
+
94
+ Permission is hereby granted, free of charge, to any person obtaining
95
+ a copy of this software and associated documentation files (the
96
+ 'Software'), to deal in the Software without restriction, including
97
+ without limitation the rights to use, copy, modify, merge, publish,
98
+ distribute, sublicense, and/or sell copies of the Software, and to
99
+ permit persons to whom the Software is furnished to do so, subject to
100
+ the following conditions:
101
+
102
+ The above copyright notice and this permission notice shall be
103
+ included in all copies or substantial portions of the Software.
104
+
105
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
106
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
107
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
108
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
109
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
110
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
111
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Rmmseg::Cpp::Traditional
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'rmmseg-cpp-traditional'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install rmmseg-cpp-traditional
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
+ # -*- Ruby -*-
2
+
3
+ load 'tasks/setup.rb'
4
+
5
+ ensure_in_path 'lib'
6
+ require 'rmmseg'
7
+
8
+ task :default => 'spec:run'
9
+
10
+ PROJ.name = 'rmmseg-cpp'
11
+ PROJ.version = '0.2.9'
12
+ PROJ.authors = 'pluskid'
13
+ PROJ.email = 'pluskid@gmail.com'
14
+ PROJ.url = 'http://rmmseg-cpp.rubyforge.org'
15
+ PROJ.rubyforge.name = 'rmmseg-cpp'
16
+
17
+ PROJ.spec.opts << '--color'
18
+
19
+ # EOF
data/bin/rmmseg ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path(
4
+ File.join(File.dirname(__FILE__), '..', 'lib', 'rmmseg'))
5
+
6
+ require 'getoptlong'
7
+
8
+ def print_usage
9
+ puts <<EOF
10
+ #{__FILE__} Segment Chinese text. Read from stdin and print to stdout.
11
+
12
+ Options:
13
+ -h
14
+ --help Print this message
15
+
16
+ -s
17
+ --separator Select the separator of the segmented text. Default is
18
+ space.
19
+ EOF
20
+ exit 0
21
+ end
22
+
23
+ separator = " "
24
+
25
+ optparser = GetoptLong.new
26
+ optparser.set_options(["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
27
+ ["-h", "--help", GetoptLong::NO_ARGUMENT])
28
+
29
+ loop do
30
+ begin
31
+ opt, arg = optparser.get
32
+ break if not opt
33
+
34
+ case opt
35
+ when "-h"
36
+ print_usage
37
+
38
+ when "-s"
39
+ separator = arg
40
+ end
41
+
42
+ rescue => err
43
+ puts err
44
+ exit 1
45
+ end
46
+ end
47
+
48
+ RMMSeg::Dictionary.load_dictionaries
49
+ algor = RMMSeg::Algorithm.new(STDIN.read)
50
+ tok = algor.next_token
51
+ unless tok.nil?
52
+ print tok.text
53
+
54
+ loop do
55
+ tok = algor.next_token
56
+ break if tok.nil?
57
+ print separator
58
+ print tok.text
59
+ end
60
+ puts
61
+ end
62
+
63
+ # EOF