rmmseg 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module RMMSeg
  # A Chunk groups one or more successive Word objects and exposes the
  # statistics the MMSEG filtering rules operate on. Every derived value
  # is computed lazily on first access and cached.
  class Chunk
    # The words held by this chunk.
    attr_reader :words

    # Build a Chunk from an array of Word.
    def initialize(words)
      @words = words
      @average_length = nil
      @total_length = nil
      @variance = nil
      @degree_of_morphemic_freedom = nil
    end

    # The sum of the lengths of all words held by this chunk (a Float).
    def total_length
      if @total_length.nil?
        sum = 0.0
        @words.each { |w| sum += w.length }
        @total_length = sum
      end
      @total_length
    end

    # The average length of the words held by this chunk.
    def average_length
      @average_length = total_length / @words.size if @average_length.nil?
      @average_length
    end

    # A spread measure over the word lengths.
    # NOTE(review): despite the historical name, this is the square root
    # of the *sum* of squared deviations (not divided by the word
    # count); the SVWL rule only needs the relative ordering, which is
    # the same either way.
    def variance
      if @variance.nil?
        sum_of_squares = 0.0
        @words.each do |w|
          diff = w.length - average_length
          sum_of_squares += diff * diff
        end
        @variance = Math.sqrt(sum_of_squares)
      end
      @variance
    end

    # The sum of the frequencies of all one-character CJK words held by
    # this chunk.
    def degree_of_morphemic_freedom
      if @degree_of_morphemic_freedom.nil?
        total = 0
        @words.each do |w|
          total += w.frequency if w.length == 1 && w.type == Word::TYPES[:cjk_word]
        end
        @degree_of_morphemic_freedom = total
      end
      @degree_of_morphemic_freedom
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'
require 'rmmseg/lawl_rule'
require 'rmmseg/svwl_rule'
require 'rmmseg/lsdmfocw_rule'

module RMMSeg
  # The complex (three-word chunk) variant of the MMSEG algorithm.
  class ComplexAlgorithm
    include Algorithm

    # Create a new ComplexAlgorithm. The rules applied, in order, are
    # MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
    def initialize(text)
      super
      @rules = [MMRule.new, LAWLRule.new, SVWLRule.new, LSDMFOCWRule.new]
    end

    # Create all possible chunks of up to three words starting
    # at +@index+.
    def create_chunks
      chunks = []
      find_match_words(@chars, @index).each do |first|
        after_first = @index + first.length
        if after_first == @chars.length
          # First word reaches the end of the text: one-word chunk.
          chunks << Chunk.new([first])
        elsif after_first < @chars.length
          find_match_words(@chars, after_first).each do |second|
            after_second = after_first + second.length
            if after_second == @chars.length
              # Second word reaches the end: two-word chunk.
              chunks << Chunk.new([first, second])
            elsif after_second < @chars.length
              find_match_words(@chars, after_second).each do |third|
                # An unrecognized third word contributes nothing; keep
                # the two-word chunk instead.
                if third.type == Word::TYPES[:unrecognized]
                  chunks << Chunk.new([first, second])
                else
                  chunks << Chunk.new([first, second, third])
                end
              end
            end
          end
        end
      end

      chunks
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
require 'rmmseg/complex_algorithm'

module RMMSeg
  # Configurations of RMMSeg, exposed as class-level attributes.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    @dictionaries = [
      [File.join(File.dirname(__FILE__), "chars.dic"), true],
      [File.join(File.dirname(__FILE__), "words.dic"), false]
    ]
    @max_word_length = 4

    class << self
      # The algorithm name currently in use (+:complex+ or +:simple+).
      attr_reader :algorithm

      # Set the algorithm name used to segment. Valid values are
      # +:complex+ and +:simple+. The former is the default one.
      def algorithm=(algor)
        unless [:complex, :simple].include?(algor)
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Build an instance of the algorithm class corresponding to the
      # configured algorithm name, set up to segment +text+.
      def algorithm_instance(text)
        RMMSeg.const_get("#{@algorithm.to_s.capitalize}Algorithm").new(text)
      end

      # The behavior used when an unresolved ambiguity occurs.
      attr_reader :on_ambiguity

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+. The latter is the
      # default one.
      def on_ambiguity=(behavior)
        unless [:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end

      # An array of dictionary files. Each element should be of the
      # form [file, whether_dic_include_frequency_info]. This should be
      # set before the dictionaries are loaded (they are loaded lazily,
      # only when first used); otherwise call
      # Dictionary.instance.reload manually to reload them.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4.
      # Making this value too large might slow down segmentation.
      attr_accessor :max_word_length
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'singleton'

module RMMSeg
  # The dictionary is a singleton object which is lazily initialized
  # from the files listed in +Config.dictionaries+.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Get an instance of Word corresponding to +value+.
    def get_word(value)
      word = @dic[value]
      # Entries loaded from a frequency dictionary are stored as the
      # raw "text frequency" line; construct the Word lazily on first
      # access and cache it back into the table.
      if word.is_a? String
        arr = word.split(" ")
        word = Word.new(arr[0], Word::TYPES[:cjk_word], arr[1].to_i)
        @dic[value] = word
      end
      word
    end

    # Reload all dictionary files.
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Load every configured dictionary file into +@dic+.
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each { |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      }
    end

    # Load a dictionary whose lines are "word frequency" pairs. The raw
    # line is kept as the value; get_word parses it lazily.
    def load_dictionary_with_freq(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          pair = line.split(" ")
          @dic[pair[0]] = line
        }
      }
    end

    # Load a plain word-per-line dictionary.
    def load_dictionary(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          # Use non-bang chomp: chomp! returns nil when the line has no
          # trailing newline (e.g. the last line of the file), and the
          # previous `line.chomp!.freeze` crashed with NoMethodError in
          # that case.
          word = line.chomp.freeze
          @dic[word] = word
        }
      }
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Integration of RMMSeg with the Ferret full-text search engine.
require 'rubygems'
require 'ferret'

module RMMSeg
  module Ferret
    # An analyzer that plugs RMMSeg tokenization into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer
      # Return a TokenStream over +text+ (+field+ is ignored).
      def token_stream(field, text)
        Tokenizer.new(text)
      end
    end

    # A TokenStream producing tokens via an RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+.
      def initialize(str)
        self.text = str
      end

      # Return the next Ferret token, or nil when the text is
      # exhausted.
      def next
        token = @algor.next_token
        return nil if token.nil?
        ::Ferret::Analysis::Token.new(token.text, token.start_pos, token.end_pos)
      end

      # The text being tokenized.
      def text
        @text
      end

      # Replace the text to be tokenized and reset the underlying
      # segmentation algorithm accordingly.
      def text=(str)
        @text = str
        @algor = RMMSeg::Config.algorithm_instance(@text)
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest average word length rule: keep the chunks whose average
  # word length ties for the maximum.
  class LAWLRule
    # Sort chunks by descending average length and keep the leading run
    # of chunks that share the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| y.average_length <=> x.average_length }
      ordered.similar_elements { |x, y| x.average_length == y.average_length }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character words
  # rule: keep the chunks tying for the highest degree.
  class LSDMFOCWRule
    # Sort chunks by descending degree of morphemic freedom and keep
    # the leading run that shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y|
        y.degree_of_morphemic_freedom <=> x.degree_of_morphemic_freedom
      }
      ordered.similar_elements { |x, y|
        x.degree_of_morphemic_freedom == y.degree_of_morphemic_freedom
      }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Maximum matching rule: keep the chunks whose total length ties for
  # the maximum.
  class MMRule
    # Sort chunks by descending total length and keep the leading run
    # that shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| y.total_length <=> x.total_length }
      ordered.similar_elements { |x, y| x.total_length == y.total_length }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class Array
  # Return the run of _similar_ elements starting at +index+ and
  # extending over its immediate right-hand neighbours. Similarity is
  # decided by the given block (called with two elements), or by +==+
  # when no block is given. e.g.
  #   [1,2,2,2,3,3,5].similar_elements(1)  # => [2, 2, 2]
  # and (maybe more useful example)
  #   ["Kid", "Kily", "KDE", "Foo", "Food"].similar_elements { |a, b|
  #     a[0] == b[0]
  #   }                                    # => ["Kid", "Kily", "KDE"]
  def similar_elements(index=0)
    i = index+1
    loop do
      break if i >= self.length
      if block_given?
        break unless yield(self[index], self[i])
      else
        # Fixed: the run continues *while* neighbours are equal. The
        # old code used `break if` here, so it stopped at the first
        # element equal to self[index], contradicting the documented
        # example above (it returned [2] instead of [2, 2, 2]).
        break unless self[index] == self[i]
      end
      i += 1
    end
    self[index...i]
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'

module RMMSeg
  # The simple (one-word chunk) variant of the MMSEG algorithm.
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm. The only rule used by this
    # algorithm is MMRule.
    def initialize(text)
      super
      @rules = [ MMRule.new ]
    end

    # Create all possible one-word chunks starting from +@index+.
    def create_chunks
      matches = find_match_words(@chars, @index)
      matches.map { |word| Chunk.new([word]) }
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Smallest variance of word length rule: keep the chunks whose
  # length variance ties for the minimum.
  class SVWLRule
    # Sort chunks by ascending variance and keep the leading run that
    # shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| x.variance <=> y.variance }
      ordered.similar_elements { |x, y| x.variance == y.variance }
    end
  end
end
|
data/lib/rmmseg/token.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module RMMSeg
  # A Token pairs a term's text with the offsets delimiting it in the
  # original input.
  class Token
    # Text of the token.
    attr_reader :text

    # The start position of the token — a *byte* index, not a
    # character index.
    attr_reader :start_pos

    # One greater than the position of the last byte of the token —
    # a *byte* index, not a character index.
    attr_reader :end_pos

    # Build a token from its text and its byte-offset range.
    def initialize(text, start_pos, end_pos)
      @text, @start_pos, @end_pos = text, start_pos, end_pos
    end
  end
end
|
data/lib/rmmseg/word.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module RMMSeg
  # An object representing a CJK word.
  class Word
    # Identity map of the recognized word categories.
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word; one of the keys of TYPES.
    attr_reader :type

    # The frequency of the word. This value is meaningful only when
    # this is a one-character word; nil otherwise.
    attr_reader :frequency

    # Initialize a Word object.
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text, @type, @frequency = text, type, frequency
    end

    # The number of characters in the word. *Not* the number of bytes.
    # NOTE(review): String#jlength is provided by the Ruby 1.8 jcode
    # library ($KCODE-aware length) — confirm it is loaded elsewhere;
    # the method does not exist on modern Rubies.
    def length
      @text.jlength
    end

    # The number of bytes in the word (String#length is byte count
    # under Ruby 1.8).
    def byte_size
      @text.length
    end
  end
end
|