plexus-rmmseg 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/History.txt +42 -0
  4. data/Manifest.txt +51 -0
  5. data/README.txt +74 -0
  6. data/Rakefile +12 -0
  7. data/TODO.txt +5 -0
  8. data/bin/rmmseg +65 -0
  9. data/data/chars.dic +12638 -0
  10. data/data/custom.dic +12 -0
  11. data/data/punctuation.dic +79 -0
  12. data/data/words.dic +120330 -0
  13. data/lib/rmmseg.rb +13 -0
  14. data/lib/rmmseg/algorithm.rb +136 -0
  15. data/lib/rmmseg/amibguity.rb +4 -0
  16. data/lib/rmmseg/chunk.rb +41 -0
  17. data/lib/rmmseg/complex_algorithm.rb +122 -0
  18. data/lib/rmmseg/config.rb +65 -0
  19. data/lib/rmmseg/dictionary.rb +80 -0
  20. data/lib/rmmseg/ferret.rb +109 -0
  21. data/lib/rmmseg/lawl_rule.rb +12 -0
  22. data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
  23. data/lib/rmmseg/mm_rule.rb +13 -0
  24. data/lib/rmmseg/rule_helper.rb +28 -0
  25. data/lib/rmmseg/simple_algorithm.rb +37 -0
  26. data/lib/rmmseg/svwl_rule.rb +12 -0
  27. data/lib/rmmseg/token.rb +30 -0
  28. data/lib/rmmseg/version.rb +3 -0
  29. data/lib/rmmseg/word.rb +38 -0
  30. data/misc/ferret_example.rb +56 -0
  31. data/misc/homepage.erb +170 -0
  32. data/misc/homepage.html +1214 -0
  33. data/plexus-rmmseg.gemspec +20 -0
  34. data/spec/chunk_spec.rb +25 -0
  35. data/spec/complex_algorithm_spec.rb +18 -0
  36. data/spec/config_spec.rb +12 -0
  37. data/spec/dictionary_spec.rb +20 -0
  38. data/spec/lawl_rule_spec.rb +15 -0
  39. data/spec/lsdmfocw_rule_spec.rb +14 -0
  40. data/spec/mm_rule_spec.rb +15 -0
  41. data/spec/simple_algorithm_spec.rb +46 -0
  42. data/spec/spec_helper.rb +12 -0
  43. data/spec/svwl_rule_spec.rb +14 -0
  44. data/spec/word_spec.rb +9 -0
  45. data/tasks/ann.rake +76 -0
  46. data/tasks/annotations.rake +22 -0
  47. data/tasks/doc.rake +48 -0
  48. data/tasks/gem.rake +110 -0
  49. data/tasks/homepage.rake +12 -0
  50. data/tasks/manifest.rake +49 -0
  51. data/tasks/post_load.rake +26 -0
  52. data/tasks/rubyforge.rake +57 -0
  53. data/tasks/setup.rb +227 -0
  54. data/tasks/spec.rake +54 -0
  55. data/tasks/svn.rake +44 -0
  56. data/tasks/test.rake +38 -0
  57. metadata +121 -0
@@ -0,0 +1,13 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'rmmseg/version'
4
+ require 'rmmseg/config'
5
+ require 'rmmseg/simple_algorithm'
6
+ require 'rmmseg/complex_algorithm'
7
+
8
module RMMSeg
  # Expose the module's instance methods on the module itself, so that
  # +RMMSeg.segment(text)+ works in addition to calling +segment+ after
  # <tt>include RMMSeg</tt>.
  extend self

  # Segment +text+ using the algorithm configured.
  #
  # An algorithm object is obtained from +Config.algorithm_instance+ and
  # the value of its +segment+ method is returned.
  def segment(text)
    Config.algorithm_instance(text).segment
  end
end
@@ -0,0 +1,136 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'rmmseg/dictionary'
3
+ require 'rmmseg/word'
4
+ require 'rmmseg/chunk'
5
+ require 'rmmseg/token'
6
+
7
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module holds the operations shared by
  # SimpleAlgorithm and ComplexAlgorithm . +next_token+ calls
  # +get_cjk_word+ for text that is not basic latin; that method is
  # not defined here and has to be supplied by the including class.
  module Algorithm
    # Number of slots in the match cache when the including class has
    # not set up +@match_cache+ itself.
    DEFAULT_MATCH_CACHE_LENGTH = 3

    # Matches a string made of exactly one non-word character. Anchored
    # with \A and \z so the whole string is tested, not a single line.
    NONWORD_CHAR_RE = /\A\W\z/

    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance. +token+ is the class
    # which will be used to construct the result token; it is
    # instantiated as <tt>token.new(text, start, end)</tt>.
    def initialize(text, token=Token)
      @text = text
      @chars = text.each_char.to_a
      @index = 0       # position in @chars of the next unread character
      @byte_index = 0  # byte offset in the text of that same character
      @token = token
    end

    # Get the next Token recognized, or +nil+ when the text is
    # exhausted. Empty tokens (start == end, produced by runs of
    # whitespace and punctuation) are skipped.
    def next_token
      while @index < @chars.length
        token = if basic_latin?(@chars[@index])
                  get_basic_latin_word
                else
                  get_cjk_word
                end
        return token unless token.start == token.end
      end
      nil
    end

    # Segment the string in +text+ into an array
    # of words (the +text+ of every token, in order).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word. Leading and trailing basic latin non-word characters are
    # consumed but are not part of the returned token.
    #
    # The token offsets are byte offsets, like the ones produced for
    # CJK words, and +@byte_index+ is advanced together with +@index+
    # so that tokens following a latin word get correct offsets.
    def get_basic_latin_word
      first = @index
      i = first

      i += 1 while latin_nonword_at?(i)
      start_pos = i

      i += 1 while i < @chars.length &&
                   basic_latin?(@chars[i]) &&
                   !nonword_char?(@chars[i])
      end_pos = i

      i += 1 while latin_nonword_at?(i)

      # Every character consumed here is a single byte, so a distance
      # in characters is also a distance in bytes.
      start_byte = @byte_index + (start_pos - first)
      end_byte = @byte_index + (end_pos - first)

      @byte_index += i - first
      @index = i
      @token.new(@chars[start_pos...end_pos].join, start_byte, end_byte)
    end

    # Find all words occuring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . When nothing matches, the result is
    # a single unrecognized Word holding the character at +index+ .
    #
    # Results are remembered in a small ring buffer. The buffer is
    # created on demand, so this method also works when the including
    # class did not initialize +@match_cache+ .
    def find_match_words(index)
      @match_cache ||= Array.new(DEFAULT_MATCH_CACHE_LENGTH)
      @match_cache_idx ||= 0

      # Unused slots are nil and destructure to (nil, nil).
      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dic = Dictionary.instance
      str = String.new
      strlen = 0
      words = []
      i = index

      while i < @chars.length &&
            !basic_latin?(@chars[i]) &&
            strlen < Config.max_word_length

        str << @chars[i]
        strlen += 1

        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
      end

      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      # Overwrite the oldest slot and wrap around at the end.
      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx = (@match_cache_idx + 1) % @match_cache.length

      words
    end

    # Determine whether a character is a basic latin character,
    # i.e. whether it is encoded as a single byte.
    def basic_latin?(char)
      char.bytesize == 1
    end

    # Determine whether a character can NOT be part of a basic latin
    # word. Returns a truthy value (the match position) for a single
    # non-word character and +nil+ otherwise.
    def nonword_char?(char)
      NONWORD_CHAR_RE =~ char
    end

    private

    # True when +pos+ is inside the text and the character there is a
    # basic latin non-word character (whitespace, punctuation, ...).
    def latin_nonword_at?(pos)
      pos < @chars.length &&
        basic_latin?(@chars[pos]) &&
        nonword_char?(@chars[pos])
    end
  end
end
@@ -0,0 +1,4 @@
1
module RMMSeg
  # Error raised by ComplexAlgorithm#get_cjk_word when more than one
  # chunk is left after all rules were applied and
  # +Config.on_ambiguity+ is +:raise_exception+ .
  #
  # It derives from StandardError (not Exception) so that a plain
  # +rescue+ clause catches it, like any other application error.
  class Ambiguity < StandardError
  end
end
@@ -0,0 +1,41 @@
1
module RMMSeg
  # A Chunk holds one or more successive Word . It is represented by a
  # plain list of words; this module only provides the measurements
  # computed over such a list. Every word must respond to +length+ ;
  # +degree_of_morphemic_freedom+ additionally reads +type+ and
  # +frequency+ .
  module Chunk

    # The sum of length of all words (0 for an empty list).
    def self.total_length(words)
      words.inject(0) { |sum, word| sum + word.length }
    end

    # The average length of words, as a Float. An empty list yields
    # 0.0 instead of the NaN a division by zero would produce.
    def self.average_length(words)
      return 0.0 if words.empty?

      total_length(words).to_f / words.size
    end

    # A measure of how much the word lengths spread around their
    # average: the square root of the sum of squared deviations.
    #
    # NOTE: despite the method name this is not the statistical
    # variance, since the sum is not divided by the number of words
    # and a square root is taken. The value is kept as is because
    # callers may compare these numbers with each other.
    def self.variance(words)
      avglen = average_length(words)
      sqr_sum = words.inject(0.0) do |sum, word|
        delta = word.length - avglen
        sum + delta * delta
      end
      Math.sqrt(sqr_sum)
    end

    # The sum of all frequencies of one-character words. Only words
    # whose type is <tt>Word::TYPES[:cjk_word]</tt> are counted.
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) do |sum, word|
        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
          sum + word.frequency
        else
          sum
        end
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'rmmseg/algorithm'
2
+ require 'rmmseg/mm_rule'
3
+ require 'rmmseg/lawl_rule'
4
+ require 'rmmseg/svwl_rule'
5
+ require 'rmmseg/lsdmfocw_rule'
6
+
7
module RMMSeg
  # Segmentation algorithm that builds every candidate chunk of up to
  # three words and narrows the candidates down with a list of rules.
  class ComplexAlgorithm
    # Number of slots in the ring buffer used by +find_match_words+ .
    MATCH_CACHE_MAX_LENGTH = 3

    include Algorithm

    # Create a new ComplexAlgorithm . The rules applied, in this
    # order, are MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
    def initialize(text, token=Token)
      super
      @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]
      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx = 0
    end

    # Get the most proper CJK word.
    #
    # The candidate chunks are filtered by each rule in turn until
    # fewer than two are left or the rules run out. If several chunks
    # remain and +Config.on_ambiguity+ is +:raise_exception+ an
    # Ambiguity is raised; otherwise the first chunk wins. The first
    # word of the winning chunk is returned as a token with byte
    # offsets, and both +@index+ and +@byte_index+ move past it.
    def get_cjk_word
      candidates = create_chunks
      @rules.each do |rule|
        break if candidates.length < 2
        candidates = rule.filter(candidates)
      end

      if candidates.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{candidates}"
      end

      best = candidates[0][0]
      token = @token.new(best.text, @byte_index, @byte_index + best.byte_size)

      @index += best.length
      @byte_index += best.byte_size

      token
    end

    # Create all possible three-word (or less) chunks
    # starting from +@index+ . A chunk is shorter than three words
    # when the text ends first, or when the third candidate is an
    # unrecognized word (which is then left out).
    def create_chunks
      text_end = @chars.length
      result = []

      find_match_words(@index).each do |first|
        after_first = @index + first.length

        if after_first == text_end
          result << [first]
        elsif after_first < text_end
          find_match_words(after_first).each do |second|
            after_second = after_first + second.length

            if after_second == text_end
              result << [first, second]
            elsif after_second < text_end
              find_match_words(after_second).each do |third|
                if third.type == Word::TYPES[:unrecognized]
                  result << [first, second]
                else
                  result << [first, second, third]
                end
              end
            end
          end
        end
      end

      result
    end

    # Find all words occuring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . If no dictionary word starts there,
    # a single unrecognized Word for the character at +index+ is
    # returned. Results are kept in a ring buffer of
    # MATCH_CACHE_MAX_LENGTH entries.
    def find_match_words(index)
      # Slots that were never filled are nil and yield (nil, nil).
      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dictionary = Dictionary.instance
      candidate = String.new
      matches = []
      taken = 0

      while index + taken < @chars.length &&
            !basic_latin?(@chars[index + taken]) &&
            taken < Config.max_word_length

        candidate << @chars[index + taken]
        taken += 1

        matches << dictionary.get_word(candidate) if dictionary.has_word?(candidate)
      end

      if matches.empty?
        matches << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      # Store in the current slot, then advance and wrap around.
      @match_cache[@match_cache_idx] = [index, matches]
      next_slot = @match_cache_idx + 1
      @match_cache_idx = next_slot == MATCH_CACHE_MAX_LENGTH ? 0 : next_slot

      matches
    end

  end
end
@@ -0,0 +1,65 @@
1
+ require 'rmmseg/simple_algorithm'
2
+ require 'rmmseg/complex_algorithm'
3
+
4
module RMMSeg
  # Configurations of RMMSeg. All settings are stored on the class
  # itself and are read and written through class-level accessors.
  class Config
    # Algorithm names accepted by +algorithm=+ .
    ALGORITHMS = [:complex, :simple].freeze

    # Behaviors accepted by +on_ambiguity=+ .
    AMBIGUITY_BEHAVIORS = [:raise_exception, :select_first].freeze

    @algorithm = :complex
    @on_ambiguity = :select_first
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
    @dictionaries = [
      [File.join(data_dir, "chars.dic"), true],
      [File.join(data_dir, "words.dic"), false],
      [File.join(data_dir, "custom.dic"), false]
    ]
    @max_word_length = 4

    class << self
      # Get the name of the algorithm currently in use.
      attr_reader :algorithm

      # Set the algorithm name used to segment. Valid values are
      # +:complex+ and +:simple+ . The former is the default one.
      # Raises ArgumentError for any other value.
      def algorithm=(algor)
        unless ALGORITHMS.include? algor
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Get an instance of the algorithm object corresponding to the
      # algorithm name configured. +tok+ is the class of the token object
      # to be returned. For example, if you want to use with Ferret, you
      # should provide +::Ferret::Analysis::Token+ .
      #
      # The class is looked up in RMMSeg by name: +:complex+ maps to
      # ComplexAlgorithm and +:simple+ to SimpleAlgorithm .
      def algorithm_instance(text, tok=Token)
        RMMSeg.const_get("#{@algorithm.to_s.capitalize}Algorithm").new(text, tok)
      end

      # Get the behavior applied when an unresolved ambiguity occurs.
      attr_reader :on_ambiguity

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+ . The latter is the default
      # one. Raises ArgumentError for any other value.
      def on_ambiguity=(behavior)
        unless AMBIGUITY_BEHAVIORS.include? behavior
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end

      # An array of dictionary files. Each element should be of the
      # form: [file, whether_dic_include_frequency_info]. This should
      # be set before the dictionaries are loaded (They are loaded
      # only when they are used). Or else you should call
      # Dictionary.instance.reload manually to reload the
      # dictionaries.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4. Making
      # this value too large might slow down the segment operations.
      attr_reader :max_word_length

      # Set the maximum length of a CJK word. Like the other setters
      # this rejects unusable values right away: anything that is not
      # a number of at least 1 raises ArgumentError, instead of
      # failing later in the middle of a segment operation.
      def max_word_length=(length)
        unless length.is_a?(Numeric) && length >= 1
          raise ArgumentError, "Invalid max word length: #{length.inspect}"
        end
        @max_word_length = length
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'singleton'
2
+
3
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized.
  # Dictionary files are read as UTF-8, one entry per line. Both the
  # UNIX line-break '\n' and the DOS line-break '\r\n' are accepted,
  # a missing line-break after the last entry is fine, and blank
  # lines are ignored.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word to dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String(which can be converted to a Number) or Number.
    #   The number is the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get an instance of Word corresponding to +value+ , or +nil+
    # when +value+ is not in the dictionary.
    #
    # Entries are stored in the raw form accepted by +store_word+ and
    # turned into a Word on first access; the Word then replaces the
    # raw entry so that later calls return the same object.
    def get_word(value)
      entry = @dic[value]
      case entry
      when true
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word])
      when String, Numeric
        # A frequency, either as read from a file or as a number.
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word], entry.to_i)
      end
      entry
    end

    # Reload all dictionary files. Everything stored before,
    # including words added with +store_word+ , is discarded.
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Replace the word table with the content of every file listed
    # in +Config.dictionaries+ (pairs of [file, has_frequency]).
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each { |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      }
    end

    # Load a file whose lines look like "word frequency". The
    # frequency is kept as a String and converted by +get_word+ . A
    # line without a frequency is stored as a normal word, so that
    # +has_word?+ and +get_word+ stay consistent with each other.
    def load_dictionary_with_freq(file)
      each_entry(file) { |line|
        word, freq = line.split(" ")
        next if word.nil? # whitespace-only line
        @dic[word] = freq || true
      }
    end

    # Load a file that has one word per line and no frequencies.
    def load_dictionary(file)
      each_entry(file) { |line|
        @dic[line] = true
      }
    end

    # Yield every non-empty line of +file+ without its line-break.
    # +chomp+ only removes an actual "\n" or "\r\n", so the last
    # entry is not damaged when the file does not end in a newline.
    def each_entry(file)
      File.open(file, "r:UTF-8") { |f|
        f.each_line { |line|
          entry = line.chomp
          yield entry unless entry.empty?
        }
      }
    end
  end
end