plexus-rmmseg 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/History.txt +42 -0
  4. data/Manifest.txt +51 -0
  5. data/README.txt +74 -0
  6. data/Rakefile +12 -0
  7. data/TODO.txt +5 -0
  8. data/bin/rmmseg +65 -0
  9. data/data/chars.dic +12638 -0
  10. data/data/custom.dic +12 -0
  11. data/data/punctuation.dic +79 -0
  12. data/data/words.dic +120330 -0
  13. data/lib/rmmseg.rb +13 -0
  14. data/lib/rmmseg/algorithm.rb +136 -0
  15. data/lib/rmmseg/amibguity.rb +4 -0
  16. data/lib/rmmseg/chunk.rb +41 -0
  17. data/lib/rmmseg/complex_algorithm.rb +122 -0
  18. data/lib/rmmseg/config.rb +65 -0
  19. data/lib/rmmseg/dictionary.rb +80 -0
  20. data/lib/rmmseg/ferret.rb +109 -0
  21. data/lib/rmmseg/lawl_rule.rb +12 -0
  22. data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
  23. data/lib/rmmseg/mm_rule.rb +13 -0
  24. data/lib/rmmseg/rule_helper.rb +28 -0
  25. data/lib/rmmseg/simple_algorithm.rb +37 -0
  26. data/lib/rmmseg/svwl_rule.rb +12 -0
  27. data/lib/rmmseg/token.rb +30 -0
  28. data/lib/rmmseg/version.rb +3 -0
  29. data/lib/rmmseg/word.rb +38 -0
  30. data/misc/ferret_example.rb +56 -0
  31. data/misc/homepage.erb +170 -0
  32. data/misc/homepage.html +1214 -0
  33. data/plexus-rmmseg.gemspec +20 -0
  34. data/spec/chunk_spec.rb +25 -0
  35. data/spec/complex_algorithm_spec.rb +18 -0
  36. data/spec/config_spec.rb +12 -0
  37. data/spec/dictionary_spec.rb +20 -0
  38. data/spec/lawl_rule_spec.rb +15 -0
  39. data/spec/lsdmfocw_rule_spec.rb +14 -0
  40. data/spec/mm_rule_spec.rb +15 -0
  41. data/spec/simple_algorithm_spec.rb +46 -0
  42. data/spec/spec_helper.rb +12 -0
  43. data/spec/svwl_rule_spec.rb +14 -0
  44. data/spec/word_spec.rb +9 -0
  45. data/tasks/ann.rake +76 -0
  46. data/tasks/annotations.rake +22 -0
  47. data/tasks/doc.rake +48 -0
  48. data/tasks/gem.rake +110 -0
  49. data/tasks/homepage.rake +12 -0
  50. data/tasks/manifest.rake +49 -0
  51. data/tasks/post_load.rake +26 -0
  52. data/tasks/rubyforge.rake +57 -0
  53. data/tasks/setup.rb +227 -0
  54. data/tasks/spec.rake +54 -0
  55. data/tasks/svn.rake +44 -0
  56. data/tasks/test.rake +38 -0
  57. metadata +121 -0
@@ -0,0 +1,109 @@
1
+ # This file integrates RMMSeg with Ferret.
2
+ require 'singleton'
3
+ require 'rubygems'
4
+ require 'ferret'
5
+ require 'rmmseg'
6
+
7
+ module RMMSeg
8
+ module Ferret
9
# Analyzer that lets Ferret tokenize text through RMMSeg.
class Analyzer < ::Ferret::Analysis::Analyzer
  # Build an Analyzer. An optional block is remembered and later
  # called with the base token stream, so that additional
  # +TokenFilter+s can be wrapped around it, e.g.
  #
  #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
  #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  #   }
  #
  def initialize(&brk)
    @brk = brk
  end

  # Build the token stream for +text+: a Tokenizer wrapped in a
  # PunctuationFilter. When a block was given to +new+, the result
  # of calling that block with the stream is returned instead.
  # +field+ is accepted but not used here.
  def token_stream(field, text)
    stream = PunctuationFilter.new(Tokenizer.new(text))
    return stream unless @brk

    @brk.call(stream)
  end
end
32
+
33
# Token stream that obtains its tokens from the segmentation
# algorithm configured in RMMSeg::Config.
class Tokenizer < ::Ferret::Analysis::TokenStream
  # The text being tokenized.
  attr_reader :text

  # Create a new Tokenizer to tokenize +str+.
  def initialize(str)
    self.text = str
  end

  # Return whatever the underlying algorithm yields as next token.
  def next
    @algor.next_token
  end

  # Replace the text to be tokenized. A fresh algorithm instance is
  # requested for the new text, asking it to build
  # ::Ferret::Analysis::Token objects.
  def text=(str)
    @text = str
    @algor = RMMSeg::Config.algorithm_instance(@text, ::Ferret::Analysis::Token)
  end
end
57
+
58
# PunctuationFilter wraps another token stream and drops the tokens
# whose text is a stand-alone Chinese punctuation mark.
class PunctuationFilter < ::Ferret::Analysis::TokenStream
  # The punctuation dictionary: a single shared instance (Singleton)
  # holding every line of the punctuation dictionary file.
  class Dictionary
    include Singleton

    # data/punctuation.dic, located two directories above this file.
    DIC_FILE = File.join(File.dirname(__FILE__),
                         "..",
                         "..",
                         "data",
                         "punctuation.dic")

    # Load DIC_FILE, registering each line without its line ending.
    def initialize
      @dic = {}
      File.open(DIC_FILE, "r") do |file|
        file.each_line do |entry|
          # Only the keys matter; the hash is used as a set.
          @dic[entry.chomp.freeze] = nil
        end
      end
    end

    # True when +str+ is one of the loaded punctuation entries.
    def include?(str)
      @dic.key?(str)
    end
  end

  # Wrap +stream+, the token stream to be filtered.
  def initialize(stream)
    @stream = stream
  end

  # Get the next token of the wrapped stream that is not a stand-alone
  # punctuation mark, or nil once the wrapped stream returns nil.
  def next
    candidate = @stream.next
    punctuation = Dictionary.instance

    while !candidate.nil? && punctuation.include?(candidate.text)
      candidate = @stream.next
    end

    candidate
  end

  # The text of the wrapped stream.
  def text
    @stream.text
  end

  # Hand a new text over to the wrapped stream.
  def text=(str)
    @stream.text = str
  end
end
108
+ end
109
+ end
@@ -0,0 +1,12 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Largest average word length rule.
  class LAWLRule
    class << self
      # Keep the chunks whose Chunk.average_length compares highest;
      # chunks that tie for the highest value are all kept.
      def filter(chunks)
        chunks.take_highest do |left, right|
          Chunk.average_length(left) <=> Chunk.average_length(right)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character
  # words rule.
  class LSDMFOCWRule
    class << self
      # Keep the chunks whose Chunk.degree_of_morphemic_freedom
      # compares highest; ties for the highest value are all kept.
      def filter(chunks)
        chunks.take_highest do |left, right|
          left_freedom = Chunk.degree_of_morphemic_freedom(left)
          right_freedom = Chunk.degree_of_morphemic_freedom(right)
          left_freedom <=> right_freedom
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Maximum matching rule, select the chunks with the
  # maximum length.
  class MMRule
    class << self
      # Keep the chunks whose Chunk.total_length compares highest;
      # ties for the highest value are all kept.
      def filter(chunks)
        chunks.take_highest do |left, right|
          Chunk.total_length(left) <=> Chunk.total_length(right)
        end
      end
    end
  end
end
+ end
@@ -0,0 +1,28 @@
1
+ class Array
2
+ # Take the elements with the highest value. Value are compared
3
+ # through the block. e.g
4
+ #
5
+ # ["aaaa", "bb", "cccc"].take_highest { |a, b|
6
+ # a.length <=> b.length
7
+ # }
8
+ # # => ["aaaa", "cccc"]
9
+ #
10
+ def take_highest
11
+ return [] if empty?
12
+
13
+ rlt = [self.first]
14
+ max = self.first
15
+
16
+ for i in 1...length
17
+ cmp = yield(self[i], max)
18
+ if cmp == 0
19
+ rlt << self[i]
20
+ elsif cmp > 0
21
+ max = self[i]
22
+ rlt = [max]
23
+ end
24
+ end
25
+
26
+ rlt
27
+ end
28
+ end
@@ -0,0 +1,37 @@
1
+ require 'rmmseg/algorithm'
2
+ require 'rmmseg/mm_rule'
3
+
4
module RMMSeg
  # Segmentation algorithm that picks each CJK word by shrinking a
  # candidate from the maximum word length until the dictionary
  # recognizes it.
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm. Both arguments are passed on
    # unchanged to the +initialize+ reached through +super+.
    def initialize(text, token=Token)
      super
    end

    # Get the most proper CJK word starting at the current position.
    #
    # Reads @chars, @index, @byte_index and @token, which are assumed
    # to be set up by Algorithm (not shown in this file). Advances
    # @index by the number of characters consumed and @byte_index by
    # the number of bytes consumed.
    #
    # Returns a token built as @token.new(word, start, end), where
    # +start+ and +end+ are byte offsets.
    def get_cjk_word
      dic = Dictionary.instance

      # Never look past the end of the input.
      i = [Config.max_word_length, @chars.length - @index].min
      chars = @chars[@index, i]
      word = chars.join

      # Drop the last character until the dictionary knows the word;
      # a single character is accepted unconditionally.
      while i > 1 && !dic.has_word?(word)
        i -= 1
        word.slice!(-chars[i].size, chars[i].size) # truncate last char
      end

      # Offsets are byte based, so measure in bytes: String#size counts
      # characters on Ruby >= 1.9 and would under-count multi-byte text.
      byte_length = word.bytesize
      token = @token.new(word, @byte_index, @byte_index + byte_length)

      @index += i
      @byte_index += byte_length

      token
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Smallest variance of word length rule.
  class SVWLRule
    class << self
      # Keep the chunks with the smallest Chunk.variance. The operands
      # are swapped in the comparison so that a smaller variance ranks
      # as the "highest" element; ties are all kept.
      def filter(chunks)
        chunks.take_highest do |left, right|
          Chunk.variance(right) <=> Chunk.variance(left)
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
module RMMSeg
  # A Token consists of a term's text and the start and end offset
  # of the term.
  class Token
    # text::  The text of the token.
    # start:: The start position of the token. This is a *byte* index
    #         instead of a character index.
    # end::   One greater than the position of the last byte of the
    #         token. This is a *byte* index instead of a character index.
    attr_accessor :text, :start, :end

    # +text+ is the string held by this token; +start_pos+ and
    # +end_pos+ delimit it inside the whole text being tokenized.
    def initialize(text, start_pos, end_pos)
      @text, @start, @end = text, start_pos, end_pos
    end

    # Return a copy of the token's text.
    def to_s
      text.dup
    end
  end
end
@@ -0,0 +1,3 @@
1
module RMMSeg
  # Version string of this release.
  VERSION = '0.1.6'
end
@@ -0,0 +1,38 @@
1
module RMMSeg
  # An object representing a CJK word.
  class Word
    # The known word types; every key maps to itself.
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word, may be one of the keys of TYPES.
    attr_reader :type

    # The frequency of the word. This value is meaningful only
    # when this is a one-character word.
    attr_reader :frequency

    # The number of characters in the word, as reported by
    # String#length when the word was created. *Not* number of bytes.
    attr_reader :length

    # Initialize a Word object.
    #
    # text::      the content of the word
    # type::      defaults to TYPES[:unrecognized]
    # frequency:: defaults to nil
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text = text
      @type = type
      @frequency = frequency
      @length = @text.length
    end

    # The number of bytes in the word.
    def byte_size
      # String#bytesize yields the same count as collecting every byte
      # into an array, without building that array.
      @text.bytesize
    end
  end
end
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'rmmseg/ferret'
5
+
6
# Analyzer that lower-cases the tokens produced by RMMSeg.
analyzer = RMMSeg::Ferret::Analyzer.new do |stream|
  Ferret::Analysis::LowerCaseFilter.new(stream)
end

# Index shared with highlight_search through a global variable.
$index = Ferret::Index::Index.new(:analyzer => analyzer)

ferret_blurb = <<END
Ferret is a high-performance, full-featured text search engine library
written for Ruby. It is inspired by Apache Lucene Java project. With
the introduction of Ferret, Ruby users now have one of the fastest and
most flexible search libraries available. And it is surprisingly easy
to use.
END

# Sample documents, added to the index in this order.
[
  {
    :title => "分词",
    :content => "中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。"
  },
  {
    :title => "RMMSeg",
    :content => "RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。"
  },
  {
    :title => "Ruby 1.9",
    :content => "Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。"
  },
  {
    :title => "Ferret",
    :content => ferret_blurb
  }
].each do |document|
  $index << document
end
34
+
35
# Search the +content+ field of the global $index for +key+ (as a
# quoted phrase) and print, for every hit, the document title, its
# score and the highlighted excerpts of its content.
#
# The same query string is used for searching and for highlighting so
# that what is highlighted is exactly what was matched.
def highlight_search(key)
  query = %Q!content:"#{key}"!

  $index.search_each(query) do |id, score|
    puts "*** Document \"#{$index[id][:title]}\" found with a score of #{score}"
    puts "-"*40
    highlights = $index.highlight(query, id,
                                  :field => :content,
                                  :pre_tag => "\033[36m",
                                  :post_tag => "\033[m")
    # Join the excerpts explicitly: interpolating the Array would print
    # its inspect form (brackets, escaped colour codes) on Ruby >= 1.9.
    # Array() also copes with a nil result.
    puts Array(highlights).join
    puts ""
  end
end
47
+
48
# Run one highlighted search per command-line argument.
ARGV.each do |term|
  puts "\033[33mSearching for #{term}...\033[m"
  puts ""
  highlight_search(term)
end
53
+
54
+ # Local Variables:
55
+ # coding: utf-8
56
+ # End:
@@ -0,0 +1,170 @@
1
+ <%# -*- mode: text; coding: utf-8 -*- %>
2
+ <%
3
+ $title = "RMMSeg Homepage"
4
+ $authors = { 'pluskid' => 'http://pluskid.lifegoo.com' }
5
+
6
+ $unindent = ' '
7
+ %>
8
+
9
+ <% chapter "Introduction" do %>
10
+
11
+ RMMSeg is an implementation of
12
+ "MMSEG":http://technology.chtsai.org/mmseg/ Chinese word
13
+ segmentation algorithm. It is based on two variants of maximum
14
+ matching algorithms. Two algorithms are available:
15
+
16
+ * simple algorithm that uses only forward maximum matching.
17
+ * complex algorithm that uses three-word chunk maximum matching and 3
18
+ additional rules to resolve ambiguities.
19
+
20
+ For more information about the algorithm, please refer to the
21
+ following essays:
22
+
23
+ * http://technology.chtsai.org/mmseg/
24
+ * http://pluskid.lifegoo.com/?p=261
25
+
26
+ RMMSeg can be used as either a stand alone program or an Analyzer of
27
+ "Ferret":http://ferret.davebalmain.com/trac.
28
+
29
+ <% end %>
30
+
31
+ <% chapter "Setup" do %>
32
+ <% section "Requirements" do %>
33
+
34
+ Your system needs the following software to run RMMSeg.
35
+
36
+ |_. Software |_. Notes |
37
+ | "Ruby":http://ruby-lang.org | Version 1.8.x is required |
38
+ | "hoe":http://seattlerb.rubyforge.org/hoe/ | If you want to build the gem manually |
39
+ | "Rake":http://rake.rubyforge.org/ | If you want to build the gem manually |
40
+ | "rspec":http://rspec.rubyforge.org/ | If you want to run the testcases |
41
+
42
+ <% end %>
43
+
44
+ <% section "Installation" do %>
45
+ <% section "Using RubyGems" do %>
46
+ To install the gem remotely from "RubyForge":http://rubyforge.org :
47
+
48
+ sudo gem install rmmseg
49
+
50
+ Or you can download the gem file manually from "RubyForge":http://rubyforge.org/projects/rmmseg/ and install it locally:
51
+
52
+ sudo gem install --local rmmseg-x.y.z.gem
53
+
54
+ <% end %>
55
+
56
+ <% section "From Subversion" do %>
57
+ From subversion repository hosted at "RubyForge":http://rmmseg.rubyforge.org/svn/, you can always get the latest source code.
58
+ <% note "The latest code might be unstable" do %>
59
+ Some new features may only be available in the latest code in subversion, but the code might be broken in some cases. So it is recommended to use the released gem package for production.
60
+ <% end %>
61
+ To check out the code from Rubyforge, you need to install subversion, then:
62
+
63
+ svn checkout http://rmmseg.rubyforge.org/svn/trunk/ rmmseg
64
+
65
+ Then you can run
66
+
67
+ rake gem
68
+
69
+ to build the gem file.
70
+ <% end %>
71
+ <% end %>
72
+ <% end %>
73
+
74
+ <% chapter "Usage" do %>
75
+
76
+ <% section "Stand Alone rmmseg" do %>
77
+ RMMSeg comes with a script @rmmseg@. To get the basic usage, just execute it with @-h@ option:
78
+
79
+ rmmseg -h
80
+
81
+ It reads from STDIN and prints the result to STDOUT. Here is a real
82
+ example:
83
+
84
+ $ echo "我们都喜欢用 Ruby" | rmmseg
85
+ 我们 都 喜欢 用 Ruby
86
+
87
+ <% end %>
88
+
89
+ <% section "Analyzer for Ferret" do %>
90
+ RMMSeg includes an analyzer for Ferret. It is ready to
91
+ use. Just require it and pass it to Ferret. Here's a complete
92
+ example:
93
+
94
+ <code lang="ruby">
95
+ <%# include ferret_example.rb %>
96
+ </code>
97
+
98
+ execute it on the following key words:
99
+
100
+ $ ruby ferret_example.rb Ruby 中文
101
+
102
+ will generate the following results:
103
+
104
+ <code lang="text">
105
+ Searching for Ruby...
106
+
107
+ *** Document "RMMSeg" found with a score of 0.21875
108
+ ----------------------------------------
109
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
110
+
111
+ *** Document "Ruby 1.9" found with a score of 0.21875
112
+ ----------------------------------------
113
+ Ruby 1.9.0 已经发布了,1.9 的一个重大改进就是对 Unicode 的支持。
114
+
115
+ *** Document "Ferret" found with a score of 0.176776692271233
116
+ ----------------------------------------
117
+ Ferret is a high-performance, full-featured text search engine library
118
+ written for Ruby. It is inspired by Apache Lucene Java project. With
119
+ the introduction of Ferret, Ruby users now have one of the fastest and
120
+ most flexible search libraries available. And it's surprisingly easy
121
+ to use.
122
+
123
+ Searching for 中文...
124
+
125
+ *** Document "分词" found with a score of 0.281680464744568
126
+ ----------------------------------------
127
+ 中文分词比较困难,不像英文那样,直接在空格和标点符号的地方断开就可以了。
128
+
129
+ *** Document "RMMSeg" found with a score of 0.281680464744568
130
+ ----------------------------------------
131
+ RMMSeg 我近日做的一个 Ruby 中文分词实现,下一步是和 Ferret 进行集成。
132
+ </code>
133
+
134
+ And if you run the example in terminal, you'll see the result
135
+ highlighted as in <%= xref "Ferret Example Screenshot" %>.
136
+
137
+ <% figure "Ferret Example Screenshot" do %>
138
+ !http://pluskid.lifegoo.com/wp-content/uploads/2008/02/rmmseg.png!
139
+ <% end %>
140
+
141
+ <% end %>
142
+
143
+ <% section "Customization" do %>
144
+ RMMSeg can be customized through @RMMSeg::Config@. For example, to use your own dictionaries, just set it before starting to do segmentation:
145
+
146
+ <code lang="ruby">
147
+ RMMSeg::Config.dictionaries = [["dict1.dic", true], # with frequency info
148
+ ["dict2.dic", false], # without
149
+ ["dict3.dic", false]]
150
+ RMMSeg::Config.max_word_length = 6
151
+ </code>
152
+
153
+ Or to use the simple algorithm for more efficient (and less accurate) segmenting:
154
+
155
+ <code>
156
+ RMMSeg::Config.algorithm = :simple
157
+ </code>
158
+
159
+ For more information on customization, please refer to the RDoc of "RMMSeg::Config":http://rmmseg.rubyforge.org/rmmseg/index.html.
160
+ <% end %>
161
+
162
+ <% end %>
163
+
164
+ <% chapter "Resources" do %>
165
+ * "Project Home":http://rubyforge.org/projects/rmmseg/: The Project page at RubyForge.
166
+ * "RDoc of RMMSeg":http://rmmseg.rubyforge.org/rmmseg/index.html: The auto generated rdoc of RMMSeg.
167
+ * "A Screencast":http://pluskid.lifegoo.com/?p=272: Demo of Ferret RMMSeg and acts_as_ferret.
168
+ * "Implementation Details":http://pluskid.lifegoo.com/?p=261: My blog post about the implementation details of RMMSeg (Chinese).
169
+ * "Author's Email":mailto:pluskid@gmail.com: Contact me if you have any problem.
170
+ <% end %>