plexus-rmmseg 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/History.txt +42 -0
- data/Manifest.txt +51 -0
- data/README.txt +74 -0
- data/Rakefile +12 -0
- data/TODO.txt +5 -0
- data/bin/rmmseg +65 -0
- data/data/chars.dic +12638 -0
- data/data/custom.dic +12 -0
- data/data/punctuation.dic +79 -0
- data/data/words.dic +120330 -0
- data/lib/rmmseg.rb +13 -0
- data/lib/rmmseg/algorithm.rb +136 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chunk.rb +41 -0
- data/lib/rmmseg/complex_algorithm.rb +122 -0
- data/lib/rmmseg/config.rb +65 -0
- data/lib/rmmseg/dictionary.rb +80 -0
- data/lib/rmmseg/ferret.rb +109 -0
- data/lib/rmmseg/lawl_rule.rb +12 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
- data/lib/rmmseg/mm_rule.rb +13 -0
- data/lib/rmmseg/rule_helper.rb +28 -0
- data/lib/rmmseg/simple_algorithm.rb +37 -0
- data/lib/rmmseg/svwl_rule.rb +12 -0
- data/lib/rmmseg/token.rb +30 -0
- data/lib/rmmseg/version.rb +3 -0
- data/lib/rmmseg/word.rb +38 -0
- data/misc/ferret_example.rb +56 -0
- data/misc/homepage.erb +170 -0
- data/misc/homepage.html +1214 -0
- data/plexus-rmmseg.gemspec +20 -0
- data/spec/chunk_spec.rb +25 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- data/tasks/ann.rake +76 -0
- data/tasks/annotations.rake +22 -0
- data/tasks/doc.rake +48 -0
- data/tasks/gem.rake +110 -0
- data/tasks/homepage.rake +12 -0
- data/tasks/manifest.rake +49 -0
- data/tasks/post_load.rake +26 -0
- data/tasks/rubyforge.rake +57 -0
- data/tasks/setup.rb +227 -0
- data/tasks/spec.rake +54 -0
- data/tasks/svn.rake +44 -0
- data/tasks/test.rake +38 -0
- metadata +121 -0
data/lib/rmmseg.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'rmmseg/version'
|
4
|
+
require 'rmmseg/config'
|
5
|
+
require 'rmmseg/simple_algorithm'
|
6
|
+
require 'rmmseg/complex_algorithm'
|
7
|
+
|
8
|
+
module RMMSeg
  # Split +text+ into words.
  #
  # Asks Config for an algorithm instance built around +text+ and
  # returns whatever that instance's +segment+ method returns.
  def segment(text)
    segmenter = Config.algorithm_instance(text)
    segmenter.segment
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rmmseg/dictionary'
|
3
|
+
require 'rmmseg/word'
|
4
|
+
require 'rmmseg/chunk'
|
5
|
+
require 'rmmseg/token'
|
6
|
+
|
7
|
+
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module holds the operations shared by
  # SimpleAlgorithm and ComplexAlgorithm .
  #
  # An including class must provide +get_cjk_word+, which is called by
  # #next_token whenever the current character is not a single-byte
  # character.
  module Algorithm
    # Number of slots in the ring buffer used by #find_match_words.
    MATCH_CACHE_MAX_LENGTH = 3

    # Matches a string made of exactly one non-word character.
    NONWORD_CHAR_RE = /\A\W\z/

    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance. +token+ is the class
    # which will be used to construct the result token; it is
    # instantiated as <tt>token.new(text, start, end)</tt>.
    def initialize(text, token=Token)
      @text = text
      @chars = text.each_char.to_a
      @index = 0        # position in @chars of the next unread character
      @byte_index = 0   # byte offset in @text matching @index
      @token = token
    end

    # Get the next Token recognized, or +nil+ when the whole text has
    # been consumed. Empty tokens (runs made only of whitespace or
    # punctuation) are skipped.
    def next_token
      while @index < @chars.length
        token = if basic_latin?(@chars[@index])
                  get_basic_latin_word
                else
                  get_cjk_word
                end
        return token unless token.start == token.end # skip empty tokens
      end

      nil
    end

    # Segment the text given to #initialize into an array of words
    # (the +text+ of every token returned by #next_token).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word.
    #
    # Returns a token whose start/end are byte offsets, consistent with
    # the offsets derived from +@byte_index+ for CJK words. Every
    # character consumed here is a single byte, so the byte offset
    # advances by exactly the number of characters consumed.
    def get_basic_latin_word
      # Leading non-word characters.
      i = @index
      i += 1 while i < @chars.length && basic_latin?(@chars[i]) && nonword_char?(@chars[i])
      start_pos = i

      # The word itself.
      i += 1 while i < @chars.length && basic_latin?(@chars[i]) && !nonword_char?(@chars[i])
      end_pos = i

      # Trailing non-word characters.
      i += 1 while i < @chars.length && basic_latin?(@chars[i]) && nonword_char?(@chars[i])

      byte_start = @byte_index + (start_pos - @index)
      byte_end = @byte_index + (end_pos - @index)

      # Keep the byte offset in step with the character index so that
      # tokens produced after this one get correct offsets.
      @byte_index += i - @index
      @index = i

      @token.new(@text[start_pos...end_pos], byte_start, byte_end)
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . When no dictionary word starts at
    # +index+, a single unrecognized Word holding the character at
    # +index+ is returned instead.
    #
    # Results are remembered in a small ring buffer keyed by +index+.
    def find_match_words(index)
      # The including class may not have set the cache up.
      @match_cache ||= Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx ||= 0

      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dic = Dictionary.instance
      str = String.new
      words = []
      i = index

      while i < @chars.length &&
            !basic_latin?(@chars[i]) &&
            (i - index) < Config.max_word_length
        str << @chars[i]
        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
      end

      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx += 1
      @match_cache_idx = 0 if @match_cache_idx >= MATCH_CACHE_MAX_LENGTH

      words
    end

    # Determine whether a character is a basic latin character, i.e.
    # whether it occupies exactly one byte.
    def basic_latin?(char)
      char.bytesize == 1
    end

    # Determine whether a character can NOT be part of a basic latin
    # word (it matches +\W+). Returns +true+ or +false+.
    def nonword_char?(char)
      !(NONWORD_CHAR_RE =~ char).nil?
    end
  end
end
|
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module RMMSeg
  # A Chunk holds one or more successive Word . The functions here
  # take the chunk as a plain collection of words; each word must
  # respond to +length+, and for #degree_of_morphemic_freedom also to
  # +type+ and +frequency+.
  module Chunk

    # The sum of length of all words.
    def self.total_length(words)
      words.inject(0) { |len, word| len + word.length }
    end

    # The average length of words, as a Float. Returns 0.0 for an
    # empty chunk instead of dividing by zero.
    def self.average_length(words)
      return 0.0 if words.size == 0

      total_length(words).to_f / words.size
    end

    # A measure of how much the word lengths spread around their
    # average: the square root of the sum of squared deviations from
    # the average length.
    #
    # NOTE: despite the method name this is neither the statistical
    # variance nor the standard deviation, because the sum is not
    # divided by the number of words.
    def self.variance(words)
      avglen = average_length(words)
      sqr_sum = words.inject(0.0) do |sum, word|
        deviation = word.length - avglen
        sum + deviation * deviation
      end
      Math.sqrt(sqr_sum)
    end

    # The sum of all frequencies of one-character words. Only words
    # whose type is <tt>Word::TYPES[:cjk_word]</tt> are counted.
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) do |sum, word|
        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
          sum + word.frequency
        else
          sum
        end
      end
    end
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rmmseg/algorithm'
|
2
|
+
require 'rmmseg/mm_rule'
|
3
|
+
require 'rmmseg/lawl_rule'
|
4
|
+
require 'rmmseg/svwl_rule'
|
5
|
+
require 'rmmseg/lsdmfocw_rule'
|
6
|
+
|
7
|
+
module RMMSeg
  # Segmentation algorithm that builds candidate chunks of up to three
  # words and narrows them down with a sequence of filter rules.
  class ComplexAlgorithm
    # Number of slots in the ring buffer used by #find_match_words.
    MATCH_CACHE_MAX_LENGTH = 3

    include Algorithm

    # Build a ComplexAlgorithm for +text+. The filters applied, in
    # order, are MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
    def initialize(text, token=Token)
      super
      @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]
      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx = 0
    end

    # Pick the best CJK word starting at the current position and
    # return it as a token, advancing the character and byte cursors
    # past it.
    #
    # Rules are applied one after another until at most one chunk is
    # left or the rules run out. If several chunks survive and
    # +Config.on_ambiguity+ is +:raise_exception+, Ambiguity is
    # raised; otherwise the first surviving chunk wins.
    # NOTE(review): Ambiguity is not defined or required in the visible
    # part of this file - confirm it is loaded elsewhere.
    def get_cjk_word
      candidates = create_chunks
      @rules.each do |rule|
        break if candidates.length < 2
        candidates = rule.filter(candidates)
      end

      if candidates.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{candidates}"
      end

      best = candidates[0][0]
      token = @token.new(best.text, @byte_index, @byte_index+best.byte_size)

      @index += best.length
      @byte_index += best.byte_size

      token
    end

    # Enumerate every chunk of up to three words that starts at
    # +@index+ . A chunk is cut short when the text ends, or when the
    # third word would be an unrecognized one.
    def create_chunks
      text_length = @chars.length
      result = []

      find_match_words(@index).each do |first|
        after_first = @index + first.length

        if after_first == text_length
          result << [first]
          next
        end
        next unless after_first < text_length

        find_match_words(after_first).each do |second|
          after_second = after_first + second.length

          if after_second == text_length
            result << [first, second]
            next
          end
          next unless after_second < text_length

          find_match_words(after_second).each do |third|
            result << if third.type == Word::TYPES[:unrecognized]
                        [first, second]
                      else
                        [first, second, third]
                      end
          end
        end
      end

      result
    end

    # Collect all dictionary words that begin at +index+ , trying
    # prefixes of up to +Config.max_word_length+ characters and
    # stopping at the first single-byte character. When nothing
    # matches, the result is a single unrecognized Word holding the
    # character at +index+ .
    #
    # Answers are kept in a ring buffer of MATCH_CACHE_MAX_LENGTH
    # entries, each of the form [index, words].
    def find_match_words(index)
      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dictionary = Dictionary.instance
      prefix = String.new
      taken = 0
      matches = []
      pos = index

      until pos >= @chars.length ||
            basic_latin?(@chars[pos]) ||
            taken >= Config.max_word_length
        prefix << @chars[pos]
        taken += 1
        matches << dictionary.get_word(prefix) if dictionary.has_word?(prefix)
        pos += 1
      end

      matches << Word.new(@chars[index], Word::TYPES[:unrecognized]) if matches.empty?

      # Store the answer in the current slot, then move to the next
      # slot, wrapping around at the end of the buffer.
      @match_cache[@match_cache_idx] = [index, matches]
      next_slot = @match_cache_idx + 1
      @match_cache_idx = next_slot == MATCH_CACHE_MAX_LENGTH ? 0 : next_slot

      matches
    end

  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
|
2
|
+
require 'rmmseg/complex_algorithm'
|
3
|
+
|
4
|
+
module RMMSeg
  # Process-wide settings of RMMSeg, kept as class-level state.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    @max_word_length = 4
    # Each entry is [path, whether the file carries frequency info].
    @dictionaries = [
      ["chars.dic", true],
      ["words.dic", false],
      ["custom.dic", false]
    ].map do |name, has_freq|
      [File.join(File.dirname(__FILE__), "..", "..", "data", name), has_freq]
    end

    class << self
      # Name of the algorithm currently selected.
      attr_reader :algorithm

      # What happens when an ambiguity cannot be resolved.
      attr_reader :on_ambiguity

      # Dictionary files as an array of
      # [file, whether_dic_include_frequency_info] pairs. Change this
      # before the dictionaries are loaded (they are loaded on first
      # use); afterwards call Dictionary.instance.reload yourself to
      # pick up the change.
      attr_accessor :dictionaries

      # Maximum length of a CJK word, 4 by default. A very large value
      # might slow segmentation down.
      attr_accessor :max_word_length

      # Select the segmentation algorithm. Accepts +:complex+ (the
      # default) or +:simple+ ; anything else raises ArgumentError.
      def algorithm=(algor)
        if ![:complex, :simple].include?(algor)
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Instantiate the algorithm class matching the configured name
      # for +text+ . +tok+ is the class of the token object to be
      # returned. For example, to use with Ferret, pass
      # +::Ferret::Analysis::Token+ .
      def algorithm_instance(text, tok=Token)
        class_name = "#{@algorithm.to_s.capitalize}Algorithm"
        RMMSeg.const_get(class_name).new(text, tok)
      end

      # Choose the behavior on an unresolved ambiguity. Accepts
      # +:raise_exception+ or +:select_first+ (the default); anything
      # else raises ArgumentError.
      def on_ambiguity=(behavior)
        if ![:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized.
  # Dictionary files are read as UTF-8, one entry per line; both UNIX
  # ('\n') and DOS ('\r\n') line breaks are accepted and blank lines
  # are ignored.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word to dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String(which can be converted to a Number) or Number.
    #   The number is the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get an instance of Word corresponding to +value+ , or +nil+ when
    # +value+ is not in the dictionary.
    #
    # Entries are stored in a compact form (+true+ or a frequency) and
    # turned into a Word the first time they are requested; the Word is
    # then kept in place of the compact form.
    def get_word(value)
      entry = @dic[value]
      case entry
      when true
        @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word])
      when String, Numeric
        # Frequencies may come from a file (String) or from
        # #store_word (String or Number).
        @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word], entry.to_i)
      else
        entry
      end
    end

    # Reload all dictionary files, discarding every entry currently
    # held (including words added with #store_word).
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Rebuild the word table from every file in +Config.dictionaries+ .
    # Each element is [file, has_freq]; +has_freq+ selects the parser.
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each do |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      end
    end

    # Load a file whose lines are "word frequency" separated by
    # whitespace. A line without a frequency is stored as a normal
    # word so that #get_word never returns nil for a known word.
    def load_dictionary_with_freq(file)
      File.open(file, "r:utf-8") do |f|
        f.each_line do |line|
          word, freq = line.split(" ")
          next if word.nil? # blank line
          @dic[word] = freq || true
        end
      end
    end

    # Load a file holding one word per line.
    def load_dictionary(file)
      File.open(file, "r:utf-8") do |f|
        f.each_line do |line|
          # chomp! only strips a line terminator, so a final line
          # without a trailing newline keeps its last character.
          line.chomp!
          next if line.empty?
          @dic[line] = true
        end
      end
    end
  end
end
|