rmmseg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
+ module RMMSeg
+   # A Chunk holds one or more successive Word objects.
+   class Chunk
+     # The words held by this chunk.
+     attr_reader :words
+
+     # Build a Chunk from an array of Word objects.
+     def initialize(words)
+       @words = words
+       @average_length = nil
+       @total_length = nil
+       @variance = nil
+       @degree_of_morphemic_freedom = nil
+     end
+
+     # The sum of the lengths of all words held by this chunk.
+     def total_length
+       @total_length ||= @words.inject(0.0) { |len, word| len + word.length }
+       @total_length
+     end
+
+     # The average length of the words held by this chunk.
+     def average_length
+       @average_length ||= total_length/@words.size
+       @average_length
+     end
+
+     # A measure of the spread of word lengths: the square root of the
+     # sum of squared deviations from the average length.
+     def variance
+       @variance ||= Math.sqrt(@words.inject(0.0) { |sqr_sum, word|
+         tmp = word.length - average_length
+         sqr_sum + tmp*tmp
+       })
+       @variance
+     end
+
+     # The sum of the frequencies of all one-character CJK words held
+     # by this chunk.
+     def degree_of_morphemic_freedom
+       @degree_of_morphemic_freedom ||= @words.inject(0) { |sum, word|
+         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
+           sum + word.frequency
+         else
+           sum
+         end
+       }
+       @degree_of_morphemic_freedom
+     end
+   end
+ end
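
A rough usage sketch of the metrics above (not part of the gem; it assumes the Word class shown later in this diff and a Ruby 1.8 setup where String#jlength is available via the jcode library, with illustrative values):

    words = [
      RMMSeg::Word.new("研究", RMMSeg::Word::TYPES[:cjk_word]),
      RMMSeg::Word.new("生",   RMMSeg::Word::TYPES[:cjk_word], 100)
    ]
    chunk = RMMSeg::Chunk.new(words)
    chunk.total_length                 # => 3.0  (2 + 1 characters)
    chunk.average_length               # => 1.5
    chunk.variance                     # => ~0.707, Math.sqrt(0.5**2 + 0.5**2)
    chunk.degree_of_morphemic_freedom  # => 100  (only one-character CJK words count)
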
@@ -0,0 +1,52 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+ require 'rmmseg/lawl_rule'
+ require 'rmmseg/svwl_rule'
+ require 'rmmseg/lsdmfocw_rule'
+
+ module RMMSeg
+   class ComplexAlgorithm
+     include Algorithm
+
+     # Create a new ComplexAlgorithm. The rules used by this algorithm
+     # are MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
+     def initialize(text)
+       super
+       @rules = [
+         MMRule.new,
+         LAWLRule.new,
+         SVWLRule.new,
+         LSDMFOCWRule.new
+       ]
+     end
+
+     # Create all possible three-word (or fewer) chunks
+     # starting from +@index+.
+     def create_chunks
+       chunks = Array.new
+       find_match_words(@chars, @index).each { |w0|
+         index0 = @index + w0.length
+         if index0 < @chars.length
+           find_match_words(@chars, index0).each { |w1|
+             index1 = index0 + w1.length
+             if index1 < @chars.length
+               find_match_words(@chars, index1).each { |w2|
+                 if w2.type == Word::TYPES[:unrecognized]
+                   chunks << Chunk.new([w0, w1])
+                 else
+                   chunks << Chunk.new([w0, w1, w2])
+                 end
+               }
+             elsif index1 == @chars.length
+               chunks << Chunk.new([w0, w1])
+             end
+           }
+         elsif index0 == @chars.length
+           chunks << Chunk.new([w0])
+         end
+       }
+
+       chunks
+     end
+   end
+ end
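
The Algorithm module included above is not part of this diff. Presumably it walks the text, calls create_chunks at each position, and then runs the candidates through @rules until a single chunk survives. A hypothetical sketch of that filtering step (not the gem's actual code, assuming each rule responds to #filter(chunks)):

    def pick_chunk(chunks, rules, on_ambiguity = :select_first)
      rules.each do |rule|
        return chunks.first if chunks.size == 1
        chunks = rule.filter(chunks)   # keep only the chunks the rule prefers
      end
      if chunks.size > 1 && on_ambiguity == :raise_exception
        raise "Unresolved segmentation ambiguity"
      end
      chunks.first                     # the :select_first behavior from Config
    end
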
@@ -0,0 +1,59 @@
+ require 'rmmseg/simple_algorithm'
+ require 'rmmseg/complex_algorithm'
+
+ module RMMSeg
+   # Configurations of RMMSeg.
+   class Config
+     @algorithm = :complex
+     @on_ambiguity = :select_first
+     @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
+                      [File.join(File.dirname(__FILE__), "words.dic"), false]]
+     @max_word_length = 4
+
+     class << self
+       # Get the name of the algorithm currently in use.
+       def algorithm
+         @algorithm
+       end
+       # Set the algorithm used to segment text. Valid values are
+       # +:complex+ and +:simple+. The former is the default.
+       def algorithm=(algor)
+         unless [:complex, :simple].include? algor
+           raise ArgumentError, "Unknown algorithm #{algor}"
+         end
+         @algorithm = algor
+       end
+       # Get an instance of the algorithm object corresponding to the
+       # configured algorithm name.
+       def algorithm_instance(text)
+         RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text)
+       end
+
+       # Get the behavior used when an unresolved ambiguity occurs.
+       def on_ambiguity
+         @on_ambiguity
+       end
+       # Set the behavior on an unresolved ambiguity. Valid values are
+       # +:raise_exception+ and +:select_first+. The latter is the
+       # default.
+       def on_ambiguity=(behavior)
+         unless [:raise_exception, :select_first].include? behavior
+           raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
+         end
+         @on_ambiguity = behavior
+       end
+
+       # An array of dictionary files. Each element should be of the
+       # form [file, whether_the_file_includes_frequency_info]. This
+       # should be set before the dictionaries are loaded (they are
+       # loaded only when first used); otherwise, call
+       # Dictionary.instance.reload manually to reload the
+       # dictionaries.
+       attr_accessor :dictionaries
+
+       # The maximum length (in characters) of a CJK word. The default
+       # value is 4. Making this value too large may slow down segmentation.
+       attr_accessor :max_word_length
+     end
+   end
+ end
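
A configuration sketch, assuming the gem is loaded via require 'rmmseg' (the entry-point file is not shown in this diff); the dictionary path and text are illustrative:

    require 'rmmseg'

    RMMSeg::Config.algorithm = :simple             # :complex is the default
    RMMSeg::Config.on_ambiguity = :raise_exception
    # Hypothetical extra dictionary without frequency information. Set this
    # before the dictionaries are first used, or call
    # RMMSeg::Dictionary.instance.reload afterwards.
    RMMSeg::Config.dictionaries << ["/path/to/extra_words.dic", false]

    algor = RMMSeg::Config.algorithm_instance("我们都喜欢用Ruby")
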
@@ -0,0 +1,66 @@
+ require 'singleton'
+
+ module RMMSeg
+   # The dictionary is a singleton object which is lazily initialized.
+   class Dictionary
+     include Singleton
+
+     # Initialize and load dictionaries from the files specified by
+     # +Config.dictionaries+.
+     def initialize
+       load_dictionaries
+     end
+
+     # Determine whether +value+ is a word in the dictionary.
+     def has_word?(value)
+       @dic.has_key?(value)
+     end
+
+     # Get an instance of Word corresponding to +value+.
+     def get_word(value)
+       word = @dic[value]
+       # Construct a Word lazily
+       if word.is_a? String
+         arr = word.split(" ")
+         word = Word.new(arr[0], Word::TYPES[:cjk_word], arr[1].to_i)
+         @dic[value] = word
+       end
+       word
+     end
+
+     # Reload all dictionary files.
+     def reload
+       @dic = nil
+       load_dictionaries
+     end
+
+     private
+     def load_dictionaries
+       @dic = Hash.new
+       Config.dictionaries.each { |file, has_freq|
+         if has_freq
+           load_dictionary_with_freq(file)
+         else
+           load_dictionary(file)
+         end
+       }
+     end
+
+     def load_dictionary_with_freq(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           pair = line.split(" ")
+           @dic[pair[0]] = line
+         }
+       }
+     end
+     def load_dictionary(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           line.chomp!.freeze
+           @dic[line] = line
+         }
+       }
+     end
+   end
+ end
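
From the two loaders above, each dictionary file is plain text with one entry per line: frequency dictionaries hold "word frequency" pairs, plain dictionaries hold bare words. A lookup sketch (the entries and frequency value are illustrative):

    # chars.dic style entries (has_freq = true): "<word> <frequency>", e.g.
    #   的 12345
    # words.dic style entries (has_freq = false): one word per line, e.g.
    #   研究
    dic = RMMSeg::Dictionary.instance
    dic.has_word?("研究")       # => true if "研究" appears in a dictionary file
    word = dic.get_word("的")   # a Word object is built lazily from the raw line
    word.frequency              # => the integer parsed from chars.dic
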
@@ -0,0 +1,43 @@
+ # This file integrates RMMSeg with Ferret.
+ require 'rubygems'
+ require 'ferret'
+
+ module RMMSeg
+   module Ferret
+     # The Analyzer class can be used with Ferret.
+     class Analyzer < ::Ferret::Analysis::Analyzer
+       def token_stream(field, text)
+         Tokenizer.new(text)
+       end
+     end
+
+     # The Tokenizer tokenizes text with an RMMSeg::Algorithm.
+     class Tokenizer < ::Ferret::Analysis::TokenStream
+       # Create a new Tokenizer to tokenize +text+.
+       def initialize(str)
+         self.text = str
+       end
+
+       # Get the next token.
+       def next
+         tk = @algor.next_token
+         if tk.nil?
+           nil
+         else
+           ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
+         end
+       end
+
+       # Get the text being tokenized.
+       def text
+         @text
+       end
+
+       # Set the text to be tokenized.
+       def text=(str)
+         @text = str
+         @algor = RMMSeg::Config.algorithm_instance(@text)
+       end
+     end
+   end
+ end
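
A usage sketch with Ferret's index API (the require path and field name are assumptions, and the document text is illustrative):

    require 'rmmseg/ferret'

    index = Ferret::Index::Index.new(:analyzer => RMMSeg::Ferret::Analyzer.new)
    index << { :content => "我们都喜欢用Ruby做文本处理" }
    index.search_each('content:"文本"') do |doc_id, score|
      puts "matched document #{doc_id} with score #{score}"
    end
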
@@ -0,0 +1,14 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest average word length rule.
+   class LAWLRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.average_length <=> a.average_length
+       }.similar_elements { |a, b|
+         a.average_length == b.average_length
+       }
+     end
+   end
+ end
@@ -0,0 +1,15 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest sum of degree of morphemic freedom of one-character
+   # words rule.
+   class LSDMFOCWRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.degree_of_morphemic_freedom <=> a.degree_of_morphemic_freedom
+       }.similar_elements { |a, b|
+         a.degree_of_morphemic_freedom == b.degree_of_morphemic_freedom
+       }
+     end
+   end
+ end
@@ -0,0 +1,15 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Maximum matching rule: select the chunks with the
+   # maximum total length.
+   class MMRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.total_length <=> a.total_length
+       }.similar_elements { |a, b|
+         a.total_length == b.total_length
+       }
+     end
+   end
+ end
@@ -0,0 +1,22 @@
+ class Array
+   # Return the run of consecutive elements, starting at +index+,
+   # that are _similar_ to the element at +index+. e.g.
+   #   [1,2,2,2,3,3,5].similar_elements(1)   # => [2, 2, 2]
+   # and (a perhaps more useful example)
+   #   ["Kid", "Kily", "KDE", "Foo", "Food"].similar_elements { |a, b|
+   #     a[0] == b[0]
+   #   }                                     # => ["Kid", "Kily", "KDE"]
+   def similar_elements(index=0)
+     i = index+1
+     loop do
+       break if i >= self.length
+       if block_given?
+         break unless yield(self[index], self[i])
+       else
+         break if self[index] == self[i]
+       end
+       i += 1
+     end
+     self[index...i]
+   end
+ end
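
This helper is what lets each rule keep every chunk tied for the best score: sort by the score, then take the leading run of equal values. For example:

    scores = [5, 5, 4, 3, 5]
    scores.sort { |a, b| b <=> a }.similar_elements   # => [5, 5, 5]
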
@@ -0,0 +1,22 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+
+ module RMMSeg
+   class SimpleAlgorithm
+     include Algorithm
+
+     # Create a new SimpleAlgorithm. The only rule used by this
+     # algorithm is MMRule.
+     def initialize(text)
+       super
+       @rules = [ MMRule.new ]
+     end
+
+     # Create all possible one-word chunks starting from +@index+.
+     def create_chunks
+       find_match_words(@chars, @index).map { |word|
+         Chunk.new([word])
+       }
+     end
+   end
+ end
@@ -0,0 +1,14 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Smallest variance of word length rule.
+   class SVWLRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         a.variance <=> b.variance
+       }.similar_elements { |a, b|
+         a.variance == b.variance
+       }
+     end
+   end
+ end
@@ -0,0 +1,22 @@
+ module RMMSeg
+   # A Token consists of a term's text and the start and end offsets
+   # of the term.
+   class Token
+     # Text of the token.
+     attr_reader :text
+
+     # The start position of the token. This is a *byte* index, not a
+     # character index.
+     attr_reader :start_pos
+
+     # One greater than the position of the last byte of the token.
+     # This is a *byte* index, not a character index.
+     attr_reader :end_pos
+
+     def initialize(text, start_pos, end_pos)
+       @text = text
+       @start_pos = start_pos
+       @end_pos = end_pos
+     end
+   end
+ end
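
A small illustration of the byte-based offsets (assuming UTF-8 text, where each of these characters occupies three bytes):

    token = RMMSeg::Token.new("中国", 0, 6)
    token.text        # => "中国"
    token.start_pos   # => 0
    token.end_pos     # => 6, one past the last byte
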
@@ -0,0 +1,37 @@
+ module RMMSeg
+   # An object representing a CJK word.
+   class Word
+     TYPES = {
+       :unrecognized => :unrecognized,
+       :basic_latin_word => :basic_latin_word,
+       :cjk_word => :cjk_word
+     }.freeze
+
+     # The content text of the word.
+     attr_reader :text
+
+     # The type of the word, one of the keys of TYPES.
+     attr_reader :type
+
+     # The frequency of the word. This value is meaningful only
+     # when this is a one-character word.
+     attr_reader :frequency
+
+     # Initialize a Word object.
+     def initialize(text, type=TYPES[:unrecognized], frequency=nil)
+       @text = text
+       @type = type
+       @frequency = frequency
+     end
+
+     # The number of characters in the word. *Not* the number of bytes.
+     def length
+       @text.jlength
+     end
+
+     # The number of bytes in the word.
+     def byte_size
+       @text.length
+     end
+   end
+ end
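
The gem appears to target Ruby 1.8, where String#jlength is provided by the standard jcode library and counts characters according to $KCODE, while String#length counts bytes. A sketch under that assumption:

    $KCODE = 'u'
    require 'jcode'

    word = RMMSeg::Word.new("中文", RMMSeg::Word::TYPES[:cjk_word])
    word.length      # => 2  characters
    word.byte_size   # => 6  bytes in UTF-8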