rmmseg 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module RMMSeg
  # A Chunk groups one or more consecutive Word objects and lazily
  # memoizes the statistics the disambiguation rules compare.
  class Chunk
    # The words held by this chunk.
    attr_reader :words

    # Build a Chunk from an array of Word.
    def initialize(words)
      @words = words
      @total_length = nil
      @average_length = nil
      @variance = nil
      @degree_of_morphemic_freedom = nil
    end

    # Sum of the lengths of all words in this chunk (as a Float).
    def total_length
      @total_length ||= @words.reduce(0.0) { |sum, w| sum + w.length }
    end

    # Mean length of the words in this chunk.
    def average_length
      @average_length ||= total_length / @words.size
    end

    # NOTE(review): despite the name, this is Math.sqrt of the SUM of
    # squared deviations from the mean (it is not divided by the word
    # count, and sqrt makes it a deviation-like quantity rather than a
    # variance). Kept as-is because the rules only compare these values
    # between chunks.
    def variance
      @variance ||= begin
        squared_sum = @words.reduce(0.0) do |acc, w|
          delta = w.length - average_length
          acc + delta * delta
        end
        Math.sqrt(squared_sum)
      end
    end

    # Sum of the frequencies of all one-character CJK words held by
    # this chunk.
    def degree_of_morphemic_freedom
      @degree_of_morphemic_freedom ||= @words.reduce(0) do |sum, w|
        single_cjk = w.length == 1 && w.type == Word::TYPES[:cjk_word]
        single_cjk ? sum + w.frequency : sum
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'
require 'rmmseg/lawl_rule'
require 'rmmseg/svwl_rule'
require 'rmmseg/lsdmfocw_rule'

module RMMSeg
  # The complex (three-word lookahead) variant of the MMSeg algorithm.
  class ComplexAlgorithm
    include Algorithm

    # Create a new ComplexAlgorithm. The disambiguation rules applied,
    # in order, are MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
    def initialize(text)
      super
      @rules = [
        MMRule.new,
        LAWLRule.new,
        SVWLRule.new,
        LSDMFOCWRule.new
      ]
    end

    # Enumerate every possible chunk of up to three words starting
    # from +@index+.
    def create_chunks
      chunks = []
      find_match_words(@chars, @index).each do |first|
        after_first = @index + first.length
        if after_first == @chars.length
          # The first word already reaches the end of the text.
          chunks << Chunk.new([first])
          next
        end
        next unless after_first < @chars.length
        find_match_words(@chars, after_first).each do |second|
          after_second = after_first + second.length
          if after_second == @chars.length
            chunks << Chunk.new([first, second])
            next
          end
          next unless after_second < @chars.length
          find_match_words(@chars, after_second).each do |third|
            # An unrecognized third word contributes nothing; drop it.
            words = if third.type == Word::TYPES[:unrecognized]
                      [first, second]
                    else
                      [first, second, third]
                    end
            chunks << Chunk.new(words)
          end
        end
      end

      chunks
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
require 'rmmseg/complex_algorithm'

module RMMSeg
  # Global configuration of RMMSeg. All settings are kept as class-level
  # instance variables and accessed through class methods.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
                     [File.join(File.dirname(__FILE__), "words.dic"), false]]
    @max_word_length = 4

    class << self
      # The name of the algorithm currently in use.
      attr_reader :algorithm

      # Select the segmentation algorithm. Valid values are +:complex+
      # (the default) and +:simple+; anything else raises ArgumentError.
      def algorithm=(algor)
        unless [:complex, :simple].include?(algor)
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Instantiate the configured algorithm class for +text+.
      def algorithm_instance(text)
        klass = RMMSeg.const_get("#{@algorithm}".capitalize + "Algorithm")
        klass.new(text)
      end

      # The behavior used when an unresolved ambiguity occurs.
      attr_reader :on_ambiguity

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+ (the default); anything
      # else raises ArgumentError.
      def on_ambiguity=(behavior)
        unless [:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end

      # An array of dictionary files. Each element should be of the
      # form: [file, whether_dic_include_frequency_info]. This should
      # be set before the dictionaries are loaded (they are loaded
      # only when they are used); otherwise call
      # Dictionary.instance.reload manually to reload the
      # dictionaries.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4. Making
      # this value too large might slow down the segment operations.
      attr_accessor :max_word_length
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module RMMSeg
|
4
|
+
# The dictionary is a singleton object which is lazily initialized.
|
5
|
+
class Dictionary
|
6
|
+
include Singleton
|
7
|
+
|
8
|
+
# Initialize and load dictionaries from files specified by
|
9
|
+
# +Config.dictionaries+ .
|
10
|
+
def initialize
|
11
|
+
load_dictionaries
|
12
|
+
end
|
13
|
+
|
14
|
+
# Determin whether +value+ is a word in the dictionary.
|
15
|
+
def has_word?(value)
|
16
|
+
@dic.has_key?(value)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Get an instance of Word corresponding to +value+ .
|
20
|
+
def get_word(value)
|
21
|
+
word = @dic[value]
|
22
|
+
# Construct a Word lazily
|
23
|
+
if word.is_a? String
|
24
|
+
arr = word.split(" ")
|
25
|
+
word = Word.new(arr[0], Word::TYPES[:cjk_word], arr[1].to_i)
|
26
|
+
@dic[value] = word
|
27
|
+
end
|
28
|
+
word
|
29
|
+
end
|
30
|
+
|
31
|
+
# Reload all dictionary files.
|
32
|
+
def reload
|
33
|
+
@dic = nil
|
34
|
+
load_dictionaries
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
def load_dictionaries
|
39
|
+
@dic = Hash.new
|
40
|
+
Config.dictionaries.each { |file, has_freq|
|
41
|
+
if has_freq
|
42
|
+
load_dictionary_with_freq(file)
|
43
|
+
else
|
44
|
+
load_dictionary(file)
|
45
|
+
end
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
def load_dictionary_with_freq(file)
|
50
|
+
File.open(file, "r") { |f|
|
51
|
+
f.each_line { |line|
|
52
|
+
pair = line.split(" ")
|
53
|
+
@dic[pair[0]] = line
|
54
|
+
}
|
55
|
+
}
|
56
|
+
end
|
57
|
+
def load_dictionary(file)
|
58
|
+
File.open(file, "r") { |f|
|
59
|
+
f.each_line { |line|
|
60
|
+
line.chomp!.freeze
|
61
|
+
@dic[line] = line
|
62
|
+
}
|
63
|
+
}
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Integration of RMMSeg with the Ferret full-text search library.
require 'rubygems'
require 'ferret'

module RMMSeg
  module Ferret
    # An Analyzer that plugs RMMSeg segmentation into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer
      # Return a token stream for +text+; the +field+ name is ignored.
      def token_stream(field, text)
        Tokenizer.new(text)
      end
    end

    # A TokenStream producing tokens via an RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+.
      def initialize(str)
        self.text = str
      end

      # Return the next Ferret token, or nil when the text is exhausted.
      def next
        token = @algor.next_token
        return nil if token.nil?
        ::Ferret::Analysis::Token.new(token.text, token.start_pos, token.end_pos)
      end

      # The text being tokenized.
      def text
        @text
      end

      # Replace the text and reset the underlying algorithm instance.
      def text=(str)
        @text = str
        @algor = RMMSeg::Config.algorithm_instance(@text)
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest average word length rule: keep the chunks whose average
  # word length ties for the maximum.
  class LAWLRule
    # Filter +chunks+, returning those sharing the largest average length.
    def filter(chunks)
      ordered = chunks.sort do |x, y|
        y.average_length <=> x.average_length
      end
      ordered.similar_elements do |x, y|
        x.average_length == y.average_length
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character words
  # rule: keep the chunks tying for the maximum degree.
  class LSDMFOCWRule
    # Filter +chunks+, returning those sharing the largest degree of
    # morphemic freedom.
    def filter(chunks)
      ordered = chunks.sort do |x, y|
        y.degree_of_morphemic_freedom <=> x.degree_of_morphemic_freedom
      end
      ordered.similar_elements do |x, y|
        x.degree_of_morphemic_freedom == y.degree_of_morphemic_freedom
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Maximum matching rule: keep the chunks whose total length ties for
  # the maximum.
  class MMRule
    # Filter +chunks+, returning those sharing the largest total length.
    def filter(chunks)
      ordered = chunks.sort do |x, y|
        y.total_length <=> x.total_length
      end
      ordered.similar_elements do |x, y|
        x.total_length == y.total_length
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class Array
  # Return the run of _similar_ elements starting at +index+: the
  # element at +index+ followed by every consecutive element similar
  # to it. Similarity is decided by the given block (called as
  # +yield(self[index], other)+), or by +==+ when no block is given.
  # e.g.
  #   [1,2,2,2,3,3,5].similar_elements(1) => [2,2,2]
  # and (maybe more useful example)
  #   ["Kid", "Kily", "KDE", "Foo", "Food"].similar_elements { |a, b|
  #     a[0] == b[0]
  #   } => ["Kid", "Kily", "KDE"]
  def similar_elements(index=0)
    i = index+1
    loop do
      break if i >= self.length
      if block_given?
        break unless yield(self[index], self[i])
      else
        # BUG FIX: the blockless branch previously used `break if
        # self[index] == self[i]`, stopping at the first element EQUAL
        # to the reference -- the opposite of the documented behavior
        # (see the first example above). Continue while elements are
        # equal instead.
        break unless self[index] == self[i]
      end
      i += 1
    end
    self[index...i]
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'

module RMMSeg
  # The simple (forward maximum matching) variant of the MMSeg algorithm.
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm. MMRule is the only disambiguation
    # rule this algorithm uses.
    def initialize(text)
      super
      @rules = [ MMRule.new ]
    end

    # Build one single-word chunk for every word matching at +@index+.
    def create_chunks
      candidates = find_match_words(@chars, @index)
      candidates.map { |candidate| Chunk.new([candidate]) }
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Smallest variance of word length rule: keep the chunks whose
  # length variance ties for the minimum.
  class SVWLRule
    # Filter +chunks+, returning those sharing the smallest variance.
    def filter(chunks)
      ordered = chunks.sort do |x, y|
        x.variance <=> y.variance
      end
      ordered.similar_elements do |x, y|
        x.variance == y.variance
      end
    end
  end
end
|
data/lib/rmmseg/token.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module RMMSeg
  # A Token couples a term's text with the byte offsets delimiting it
  # inside the original input.
  class Token
    # Text of the token.
    attr_reader :text

    # Byte (not character) index at which the token starts.
    attr_reader :start_pos

    # Byte index one past the last byte of the token.
    attr_reader :end_pos

    # Create a token for +text+ spanning bytes [+start_pos+, +end_pos+).
    def initialize(text, start_pos, end_pos)
      @text, @start_pos, @end_pos = text, start_pos, end_pos
    end
  end
end
|
data/lib/rmmseg/word.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module RMMSeg
  # A segmented word together with its classification and optional
  # frequency information.
  class Word
    # Valid word categories; each key maps to itself.
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The category of the word, one of the keys of TYPES.
    attr_reader :type

    # The frequency of the word. Only meaningful for one-character words.
    attr_reader :frequency

    # Create a Word; +type+ defaults to unrecognized, +frequency+ to nil.
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text, @type, @frequency = text, type, frequency
    end

    # The number of characters (not bytes) in the word.
    # NOTE(review): String#jlength comes from the Ruby 1.8-era jcode
    # library -- confirm it is available in the target Ruby version.
    def length
      @text.jlength
    end

    # The number of bytes in the word.
    def byte_size
      @text.length
    end
  end
end
|