plexus-rmmseg 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/History.txt +42 -0
  4. data/Manifest.txt +51 -0
  5. data/README.txt +74 -0
  6. data/Rakefile +12 -0
  7. data/TODO.txt +5 -0
  8. data/bin/rmmseg +65 -0
  9. data/data/chars.dic +12638 -0
  10. data/data/custom.dic +12 -0
  11. data/data/punctuation.dic +79 -0
  12. data/data/words.dic +120330 -0
  13. data/lib/rmmseg.rb +13 -0
  14. data/lib/rmmseg/algorithm.rb +136 -0
  15. data/lib/rmmseg/amibguity.rb +4 -0
  16. data/lib/rmmseg/chunk.rb +41 -0
  17. data/lib/rmmseg/complex_algorithm.rb +122 -0
  18. data/lib/rmmseg/config.rb +65 -0
  19. data/lib/rmmseg/dictionary.rb +80 -0
  20. data/lib/rmmseg/ferret.rb +109 -0
  21. data/lib/rmmseg/lawl_rule.rb +12 -0
  22. data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
  23. data/lib/rmmseg/mm_rule.rb +13 -0
  24. data/lib/rmmseg/rule_helper.rb +28 -0
  25. data/lib/rmmseg/simple_algorithm.rb +37 -0
  26. data/lib/rmmseg/svwl_rule.rb +12 -0
  27. data/lib/rmmseg/token.rb +30 -0
  28. data/lib/rmmseg/version.rb +3 -0
  29. data/lib/rmmseg/word.rb +38 -0
  30. data/misc/ferret_example.rb +56 -0
  31. data/misc/homepage.erb +170 -0
  32. data/misc/homepage.html +1214 -0
  33. data/plexus-rmmseg.gemspec +20 -0
  34. data/spec/chunk_spec.rb +25 -0
  35. data/spec/complex_algorithm_spec.rb +18 -0
  36. data/spec/config_spec.rb +12 -0
  37. data/spec/dictionary_spec.rb +20 -0
  38. data/spec/lawl_rule_spec.rb +15 -0
  39. data/spec/lsdmfocw_rule_spec.rb +14 -0
  40. data/spec/mm_rule_spec.rb +15 -0
  41. data/spec/simple_algorithm_spec.rb +46 -0
  42. data/spec/spec_helper.rb +12 -0
  43. data/spec/svwl_rule_spec.rb +14 -0
  44. data/spec/word_spec.rb +9 -0
  45. data/tasks/ann.rake +76 -0
  46. data/tasks/annotations.rake +22 -0
  47. data/tasks/doc.rake +48 -0
  48. data/tasks/gem.rake +110 -0
  49. data/tasks/homepage.rake +12 -0
  50. data/tasks/manifest.rake +49 -0
  51. data/tasks/post_load.rake +26 -0
  52. data/tasks/rubyforge.rake +57 -0
  53. data/tasks/setup.rb +227 -0
  54. data/tasks/spec.rake +54 -0
  55. data/tasks/svn.rake +44 -0
  56. data/tasks/test.rake +38 -0
  57. metadata +121 -0
@@ -0,0 +1,13 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'rmmseg/version'
4
+ require 'rmmseg/config'
5
+ require 'rmmseg/simple_algorithm'
6
+ require 'rmmseg/complex_algorithm'
7
+
8
module RMMSeg
  # Expose the instance methods of this module on the module itself, so
  # that RMMSeg.segment(text) works in addition to the mixin style
  # (+include RMMSeg+ followed by a bare +segment(text)+).
  extend self

  # Segment +text+ using the algorithm configured.
  #
  # Asks Config.algorithm_instance for an algorithm object built around
  # +text+ and returns whatever its +segment+ method returns.
  def segment(text)
    Config.algorithm_instance(text).segment
  end
end
@@ -0,0 +1,136 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rmmseg/dictionary'
3
+ require 'rmmseg/word'
4
+ require 'rmmseg/chunk'
5
+ require 'rmmseg/token'
6
+
7
module RMMSeg
  # An algorithm segments a piece of text into an array of words. This
  # module holds the operations shared by SimpleAlgorithm and
  # ComplexAlgorithm. The including class has to provide +get_cjk_word+,
  # which +next_token+ calls whenever the current character is not a
  # basic latin character.
  module Algorithm
    # Number of +find_match_words+ results remembered by the default
    # implementation in this module.
    MATCH_CACHE_MAX_LENGTH = 3

    # Matches a string made of exactly one non-word character.
    NONWORD_CHAR_RE = /\A\W\z/

    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance. +token+ is the class
    # which will be used to construct the result token; it is
    # instantiated as +token.new(text, start, end)+.
    def initialize(text, token=Token)
      @text = text
      @chars = text.each_char.to_a
      @index = 0        # position in @chars of the next character to read
      @byte_index = 0   # byte offset in @text matching @index
      @token = token
    end

    # Get the next Token recognized, or +nil+ once the whole text has
    # been consumed. Empty tokens (start == end) are skipped.
    def next_token
      while @index < @chars.length
        token = if basic_latin?(@chars[@index])
                  get_basic_latin_word
                else
                  get_cjk_word
                end
        return token unless token.start == token.end
      end
      nil
    end

    # Segment the string in +text+ into an array
    # of words (the +text+ of every token, in order).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word. Non-word characters following the word are consumed too.
    #
    # The returned token carries byte offsets, like the tokens built from
    # +@byte_index+ for CJK words, and +@byte_index+ is advanced together
    # with +@index+ so that later tokens stay aligned.
    def get_basic_latin_word
      word_start = skip_latin_nonword_chars(@index)

      word_end = word_start
      while word_end < @chars.length &&
            basic_latin?(@chars[word_end]) &&
            !nonword_char?(@chars[word_end])
        word_end += 1
      end

      next_index = skip_latin_nonword_chars(word_end)

      # Every character consumed here is exactly one byte long, so a
      # difference of character positions is also a difference in bytes.
      start_pos = @byte_index + (word_start - @index)
      end_pos = @byte_index + (word_end - @index)

      @byte_index += next_index - @index
      @index = next_index

      @token.new(@chars[word_start...word_end].join, start_pos, end_pos)
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . When nothing matches, the result holds a
    # single unrecognized Word made of the character at +index+.
    #
    # The most recent results are kept in a small ring buffer which is
    # created on first use when the including class did not set it up.
    def find_match_words(index)
      @match_cache ||= Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx ||= 0

      # Unused slots are nil and destructure to (nil, nil).
      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dic = Dictionary.instance
      str = String.new
      words = []
      i = index

      while i < @chars.length &&
            !basic_latin?(@chars[i]) &&
            (i - index) < Config.max_word_length
        str << @chars[i]
        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
      end

      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx = (@match_cache_idx + 1) % MATCH_CACHE_MAX_LENGTH

      words
    end

    # Determine whether a character is a basic latin character, i.e.
    # whether it is encoded as a single byte.
    def basic_latin?(char)
      char.bytesize == 1
    end

    # Determine whether a character cannot be part of a basic latin
    # word. Returns a truthy value (the match position) or +nil+.
    def nonword_char?(char)
      NONWORD_CHAR_RE =~ char
    end

    private

    # Return the first position at or after +pos+ whose character is not
    # a basic latin non-word character.
    def skip_latin_nonword_chars(pos)
      while pos < @chars.length &&
            basic_latin?(@chars[pos]) &&
            nonword_char?(@chars[pos])
        pos += 1
      end
      pos
    end
  end
end
@@ -0,0 +1,4 @@
1
module RMMSeg
  # Error signalling that several candidate segmentations remain and none
  # can be preferred over the others.
  #
  # Derives from StandardError (not Exception directly) so that a plain
  # +rescue+ clause catches it; +rescue Ambiguity+ and +rescue Exception+
  # keep working as before.
  class Ambiguity < StandardError
  end
end
@@ -0,0 +1,41 @@
1
module RMMSeg
  # Scoring helpers for a chunk, i.e. an array holding one or more
  # successive Word objects.
  module Chunk
    # Add up the +length+ of every word in +words+.
    def self.total_length(words)
      words.inject(0) { |running, word| running + word.length }
    end

    # Mean word length as a Float (total length divided by word count).
    def self.average_length(words)
      total_length(words).to_f / words.size
    end

    # Spread of the word lengths around their average: the square root
    # of the sum of squared deviations. The sum is not divided by the
    # number of words.
    def self.variance(words)
      mean = average_length(words)
      squared = words.inject(0.0) do |running, word|
        delta = word.length - mean
        running + delta * delta
      end
      Math.sqrt(squared)
    end

    # Add up the +frequency+ of every word that is one character long
    # and of type Word::TYPES[:cjk_word].
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) do |running, word|
        single_cjk = word.length == 1 && word.type == Word::TYPES[:cjk_word]
        single_cjk ? running + word.frequency : running
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'rmmseg/algorithm'
2
+ require 'rmmseg/mm_rule'
3
+ require 'rmmseg/lawl_rule'
4
+ require 'rmmseg/svwl_rule'
5
+ require 'rmmseg/lsdmfocw_rule'
6
+
7
module RMMSeg
  # Segmentation algorithm that builds every candidate chunk of up to
  # three words at the current position and narrows the candidates down
  # with a list of rules.
  class ComplexAlgorithm
    # Number of +find_match_words+ results kept in the ring buffer.
    MATCH_CACHE_MAX_LENGTH = 3

    include Algorithm

    # Create a new ComplexAlgorithm . The rules applied, in order, are
    # MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
    def initialize(text, token=Token)
      super
      @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]
      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx = 0
    end

    # Get the most proper CJK word at the current position and move past
    # it.
    #
    # Rules are applied one after another until fewer than two chunks
    # remain. When several chunks are still left, Ambiguity is raised if
    # Config.on_ambiguity is +:raise_exception+; otherwise the first
    # chunk wins.
    def get_cjk_word
      candidates = create_chunks
      @rules.each do |rule|
        break if candidates.length < 2
        candidates = rule.filter(candidates)
      end

      if candidates.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{candidates}"
      end

      # The answer is the first word of the first remaining chunk.
      chosen = candidates[0][0]
      result = @token.new(chosen.text, @byte_index, @byte_index+chosen.byte_size)

      @index += chosen.length
      @byte_index += chosen.byte_size

      result
    end

    # Create all possible three-word (or less) chunks
    # starting from +@index+ .
    #
    # A chunk is shorter than three words when the text ends first, or
    # when the third word is an unrecognized one (it is then left out).
    def create_chunks
      text_end = @chars.length
      result = []

      find_match_words(@index).each do |first|
        after_first = @index + first.length
        if after_first == text_end
          result << [first]
        elsif after_first < text_end
          find_match_words(after_first).each do |second|
            after_second = after_first + second.length
            if after_second == text_end
              result << [first, second]
            elsif after_second < text_end
              find_match_words(after_second).each do |third|
                if third.type == Word::TYPES[:unrecognized]
                  result << [first, second]
                else
                  result << [first, second, third]
                end
              end
            end
          end
        end
      end

      result
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . When nothing matches, the result holds a
    # single unrecognized Word made of the character at +index+.
    def find_match_words(index)
      # Ring buffer lookup; unused slots are nil.
      hit = @match_cache.find { |entry| entry && entry[0] == index }
      return hit[1] if hit

      dic = Dictionary.instance
      candidate = String.new
      found = []
      pos = index

      while pos < @chars.length &&
            !basic_latin?(@chars[pos]) &&
            (pos - index) < Config.max_word_length
        candidate << @chars[pos]
        found << dic.get_word(candidate) if dic.has_word?(candidate)
        pos += 1
      end

      found << Word.new(@chars[index], Word::TYPES[:unrecognized]) if found.empty?

      # Remember the result, overwriting the oldest slot.
      @match_cache[@match_cache_idx] = [index, found]
      @match_cache_idx += 1
      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH

      found
    end
  end
end
@@ -0,0 +1,65 @@
1
+ require 'rmmseg/simple_algorithm'
2
+ require 'rmmseg/complex_algorithm'
3
+
4
module RMMSeg
  # Configurations of RMMSeg. All settings live on the class itself;
  # Config is not meant to be instantiated.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    @max_word_length = 4

    # The bundled dictionaries sit in the "data" directory two levels
    # above this file. The flag tells whether the file carries frequency
    # information.
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
    @dictionaries = [
      ["chars.dic", true],
      ["words.dic", false],
      ["custom.dic", false]
    ].map { |name, has_freq| [File.join(data_dir, name), has_freq] }

    class << self
      # Name of the algorithm currently in use.
      attr_reader :algorithm

      # The behavior chosen for an unresolved ambiguity.
      attr_reader :on_ambiguity

      # An array of dictionary files. Each element should be of the
      # form: [file, whether_dic_include_frequency_info]. This should
      # be set before the dictionaries are loaded (they are loaded
      # only when they are used). Otherwise call
      # Dictionary.instance.reload manually to reload the
      # dictionaries.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4. Making
      # this value too large might slow down the segment operations.
      attr_accessor :max_word_length

      # Set the algorithm name used to segment. Valid values are
      # +:complex+ and +:simple+ . The former is the default one.
      # Any other value raises ArgumentError.
      def algorithm=(algor)
        raise ArgumentError, "Unknown algorithm #{algor}" unless [:complex, :simple].include?(algor)
        @algorithm = algor
      end

      # Get an instance of the algorithm object corresponding to the
      # algorithm name configured. +tok+ is the class of the token object
      # to be returned. For example, if you want to use with Ferret, you
      # should provide +::Ferret::Analysis::Token+ .
      def algorithm_instance(text, tok=Token)
        # :complex -> ComplexAlgorithm, :simple -> SimpleAlgorithm
        class_name = @algorithm.to_s.capitalize + "Algorithm"
        RMMSeg.const_get(class_name).new(text, tok)
      end

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+ . The latter is the default
      # one. Any other value raises ArgumentError.
      def on_ambiguity=(behavior)
        known = [:raise_exception, :select_first]
        raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}" unless known.include?(behavior)
        @on_ambiguity = behavior
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'singleton'
2
+
3
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized.
  # Dictionary files are read as UTF-8, one entry per line; both UNIX
  # ('\n') and DOS ('\r\n') line breaks are accepted and blank lines are
  # ignored.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word to dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String(which can be converted to a Number) or Number.
    #   The number is the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get an instance of Word corresponding to +value+ , or +nil+ when
    # +value+ is not in the dictionary.
    #
    # Entries are kept in their raw form (+true+, a frequency String or
    # a Number) until first requested; the Word built here replaces the
    # raw entry so it is constructed only once.
    def get_word(value)
      entry = @dic[value]
      if entry == true
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word])
      elsif String === entry || Numeric === entry
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word], entry.to_i)
      end
      entry
    end

    # Reload all dictionary files.
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Rebuild +@dic+ from every [file, has_frequency] pair in
    # +Config.dictionaries+ . Later files override earlier entries.
    def load_dictionaries
      @dic = {}
      Config.dictionaries.each do |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      end
    end

    # Load a file whose lines look like "word frequency". A line without
    # a frequency is stored as a plain word rather than as +nil+.
    def load_dictionary_with_freq(file)
      File.open(file, "r:utf-8") do |f|
        f.each_line do |line|
          word, freq = line.split(" ")
          next if word.nil? # blank line
          @dic[word] = freq || true
        end
      end
    end

    # Load a file holding one word per line.
    def load_dictionary(file)
      File.open(file, "r:utf-8") do |f|
        f.each_line do |line|
          # chomp only strips a line break, so a final line without one
          # keeps its last character.
          word = line.chomp
          next if word.empty?
          @dic[word] = true
        end
      end
    end
  end
end