rmmseg 0.0.1

@@ -0,0 +1,51 @@
+ module RMMSeg
+   # A Chunk holds one or more successive Words.
+   class Chunk
+     # The words held by this chunk.
+     attr_reader :words
+
+     # Build a Chunk from an array of Words.
+     def initialize(words)
+       @words = words
+       @average_length = nil
+       @total_length = nil
+       @variance = nil
+       @degree_of_morphemic_freedom = nil
+     end
+
+     # The sum of the lengths of all words held by this chunk.
+     def total_length
+       @total_length ||= @words.inject(0.0) { |len, word| len + word.length }
+       @total_length
+     end
+
+     # The average length of the words held by this chunk.
+     def average_length
+       @average_length ||= total_length / @words.size
+       @average_length
+     end
+
+     # The spread of word lengths held by this chunk: the square root of
+     # the sum of squared deviations from the average length.
+     def variance
+       @variance ||= Math.sqrt(@words.inject(0.0) { |sqr_sum, word|
+         tmp = word.length - average_length
+         sqr_sum + tmp*tmp
+       })
+       @variance
+     end
+
+     # The sum of the frequencies of all one-character CJK words held by
+     # this chunk.
+     def degree_of_morphemic_freedom
+       @degree_of_morphemic_freedom ||= @words.inject(0) { |sum, word|
+         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
+           sum + word.frequency
+         else
+           sum
+         end
+       }
+       @degree_of_morphemic_freedom
+     end
+   end
+ end
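For reference, a minimal sketch of these Chunk metrics, assuming Ruby 1.8 with the jcode library (Word#length relies on String#jlength) and that a top-level require 'rmmseg' loads Word and Chunk; the frequencies are made-up illustration values:

    $KCODE = 'u'       # Ruby 1.8: treat strings as UTF-8 so jlength counts characters
    require 'jcode'    # provides String#jlength, which Word#length relies on
    require 'rmmseg'   # assumed entry point loading RMMSeg::Word and RMMSeg::Chunk

    # Two hand-built one-character CJK words with illustrative frequencies.
    words = [
      RMMSeg::Word.new("的", RMMSeg::Word::TYPES[:cjk_word], 1000),
      RMMSeg::Word.new("是", RMMSeg::Word::TYPES[:cjk_word], 800)
    ]
    chunk = RMMSeg::Chunk.new(words)

    chunk.total_length                 # => 2.0  (sum of character lengths)
    chunk.average_length               # => 1.0
    chunk.variance                     # => 0.0  (all words have the same length)
    chunk.degree_of_morphemic_freedom  # => 1800 (frequencies of one-character CJK words)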
@@ -0,0 +1,52 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+ require 'rmmseg/lawl_rule'
+ require 'rmmseg/svwl_rule'
+ require 'rmmseg/lsdmfocw_rule'
+
+ module RMMSeg
+   class ComplexAlgorithm
+     include Algorithm
+
+     # Create a new ComplexAlgorithm. The rules used by this algorithm
+     # are MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
+     def initialize(text)
+       super
+       @rules = [
+         MMRule.new,
+         LAWLRule.new,
+         SVWLRule.new,
+         LSDMFOCWRule.new
+       ]
+     end
+
+     # Create all possible three-word (or shorter) chunks
+     # starting from +@index+.
+     def create_chunks
+       chunks = Array.new
+       find_match_words(@chars, @index).each { |w0|
+         index0 = @index + w0.length
+         if index0 < @chars.length
+           find_match_words(@chars, index0).each { |w1|
+             index1 = index0 + w1.length
+             if index1 < @chars.length
+               find_match_words(@chars, index1).each { |w2|
+                 if w2.type == Word::TYPES[:unrecognized]
+                   chunks << Chunk.new([w0, w1])
+                 else
+                   chunks << Chunk.new([w0, w1, w2])
+                 end
+               }
+             elsif index1 == @chars.length
+               chunks << Chunk.new([w0, w1])
+             end
+           }
+         elsif index0 == @chars.length
+           chunks << Chunk.new([w0])
+         end
+       }
+
+       chunks
+     end
+   end
+ end
@@ -0,0 +1,59 @@
+ require 'rmmseg/simple_algorithm'
+ require 'rmmseg/complex_algorithm'
+
+ module RMMSeg
+   # Configuration of RMMSeg.
+   class Config
+     @algorithm = :complex
+     @on_ambiguity = :select_first
+     @dictionaries = [[File.join(File.dirname(__FILE__), "chars.dic"), true],
+                      [File.join(File.dirname(__FILE__), "words.dic"), false]]
+     @max_word_length = 4
+
+     class << self
+       # Get the name of the algorithm currently in use.
+       def algorithm
+         @algorithm
+       end
+       # Set the algorithm used to segment. Valid values are
+       # +:complex+ and +:simple+. The former is the default.
+       def algorithm=(algor)
+         unless [:complex, :simple].include? algor
+           raise ArgumentError, "Unknown algorithm #{algor}"
+         end
+         @algorithm = algor
+       end
+       # Get an instance of the algorithm class corresponding to the
+       # configured algorithm name.
+       def algorithm_instance(text)
+         RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text)
+       end
+
+       # Get the behavior used when an unresolved ambiguity occurs.
+       def on_ambiguity
+         @on_ambiguity
+       end
+       # Set the behavior on an unresolved ambiguity. Valid values are
+       # +:raise_exception+ and +:select_first+. The latter is the
+       # default.
+       def on_ambiguity=(behavior)
+         unless [:raise_exception, :select_first].include? behavior
+           raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
+         end
+         @on_ambiguity = behavior
+       end
+
+       # An array of dictionary files. Each element should be of the
+       # form [file, whether_the_dic_includes_frequency_info]. This should
+       # be set before the dictionaries are loaded (they are loaded
+       # lazily, only when first used). Otherwise you should call
+       # Dictionary.instance.reload manually to reload the
+       # dictionaries.
+       attr_accessor :dictionaries
+
+       # The maximum length of a CJK word. The default value is 4. Making
+       # this value too large might slow down segmentation.
+       attr_accessor :max_word_length
+     end
+   end
+ end
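A usage sketch of this configuration interface; the top-level require 'rmmseg' entry point and the extra dictionary path are assumptions:

    require 'rmmseg'   # assumed entry point that loads RMMSeg::Config

    RMMSeg::Config.algorithm = :simple              # or :complex (the default)
    RMMSeg::Config.on_ambiguity = :raise_exception  # instead of :select_first

    # Extend the dictionary list *before* the first segmentation, otherwise
    # call Dictionary.instance.reload afterwards (hypothetical path shown).
    RMMSeg::Config.dictionaries << ["/path/to/my_words.dic", false]

    algor = RMMSeg::Config.algorithm_instance("我是中国人")  # a SimpleAlgorithm here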
@@ -0,0 +1,66 @@
+ require 'singleton'
+
+ module RMMSeg
+   # The dictionary is a singleton object which is lazily initialized.
+   class Dictionary
+     include Singleton
+
+     # Initialize and load dictionaries from the files specified by
+     # +Config.dictionaries+.
+     def initialize
+       load_dictionaries
+     end
+
+     # Determine whether +value+ is a word in the dictionary.
+     def has_word?(value)
+       @dic.has_key?(value)
+     end
+
+     # Get the instance of Word corresponding to +value+.
+     def get_word(value)
+       word = @dic[value]
+       # Construct a Word lazily
+       if word.is_a? String
+         arr = word.split(" ")
+         word = Word.new(arr[0], Word::TYPES[:cjk_word], arr[1].to_i)
+         @dic[value] = word
+       end
+       word
+     end
+
+     # Reload all dictionary files.
+     def reload
+       @dic = nil
+       load_dictionaries
+     end
+
+     private
+     def load_dictionaries
+       @dic = Hash.new
+       Config.dictionaries.each { |file, has_freq|
+         if has_freq
+           load_dictionary_with_freq(file)
+         else
+           load_dictionary(file)
+         end
+       }
+     end
+
+     def load_dictionary_with_freq(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           pair = line.split(" ")
+           @dic[pair[0]] = line
+         }
+       }
+     end
+     def load_dictionary(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           line = line.chomp.freeze
+           @dic[line] = line
+         }
+       }
+     end
+   end
+ end
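A sketch of the lazy lookup above; whether a particular key such as "中" is present depends on the bundled chars.dic/words.dic, so the lookups are illustrative only, and the top-level require 'rmmseg' is assumed:

    require 'rmmseg'   # assumed to load RMMSeg::Dictionary and RMMSeg::Config

    dic = RMMSeg::Dictionary.instance   # dictionaries are read here, on first use

    if dic.has_word?("中")
      word = dic.get_word("中")         # built lazily from the raw "text frequency" line
      word.frequency                    # => the frequency recorded in chars.dic
    end

    # After changing Config.dictionaries at runtime:
    dic.reload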
@@ -0,0 +1,43 @@
+ # This file integrates RMMSeg with Ferret.
+ require 'rubygems'
+ require 'ferret'
+
+ module RMMSeg
+   module Ferret
+     # The Analyzer class can be used with Ferret.
+     class Analyzer < ::Ferret::Analysis::Analyzer
+       def token_stream(field, text)
+         Tokenizer.new(text)
+       end
+     end
+
+     # The Tokenizer tokenizes text with an RMMSeg::Algorithm.
+     class Tokenizer < ::Ferret::Analysis::TokenStream
+       # Create a new Tokenizer to tokenize +text+.
+       def initialize(str)
+         self.text = str
+       end
+
+       # Get the next token.
+       def next
+         tk = @algor.next_token
+         if tk.nil?
+           nil
+         else
+           ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
+         end
+       end
+
+       # Get the text being tokenized.
+       def text
+         @text
+       end
+
+       # Set the text to be tokenized.
+       def text=(str)
+         @text = str
+         @algor = RMMSeg::Config.algorithm_instance(@text)
+       end
+     end
+   end
+ end
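A sketch of plugging this Analyzer into a Ferret in-memory index; the require path 'rmmseg/ferret' and the :content field name are assumptions, while the Index calls follow Ferret's usual API:

    require 'rubygems'
    require 'ferret'
    require 'rmmseg/ferret'   # assumed load path for the integration file above

    index = Ferret::Index::Index.new(
      :analyzer => RMMSeg::Ferret::Analyzer.new   # tokenize fields with RMMSeg
    )

    index << { :content => "我是中国人" }
    index.search_each('content:"中国"') do |doc_id, score|
      puts "#{doc_id} scored #{score}"
    end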
@@ -0,0 +1,14 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest average word length rule.
+   class LAWLRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.average_length <=> a.average_length
+       }.similar_elements { |a, b|
+         a.average_length == b.average_length
+       }
+     end
+   end
+ end
@@ -0,0 +1,15 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest sum of degree of morphemic freedom of one-character
+   # words rule.
+   class LSDMFOCWRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.degree_of_morphemic_freedom <=> a.degree_of_morphemic_freedom
+       }.similar_elements { |a, b|
+         a.degree_of_morphemic_freedom == b.degree_of_morphemic_freedom
+       }
+     end
+   end
+ end
@@ -0,0 +1,15 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Maximum matching rule: select the chunks with the
+   # maximum total length.
+   class MMRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         b.total_length <=> a.total_length
+       }.similar_elements { |a, b|
+         a.total_length == b.total_length
+       }
+     end
+   end
+ end
@@ -0,0 +1,22 @@
+ class Array
+   # Return the leading run of _similar_ elements starting at +index+ ,
+   # i.e. the elements that compare as similar to the one at +index+ . e.g.
+   #   [1,2,2,2,3,3,5].similar_elements(1)  # => [2,2,2]
+   # and (a perhaps more useful example)
+   #   ["Kid", "Kily", "KDE", "Foo", "Food"].similar_elements { |a, b|
+   #     a[0] == b[0]
+   #   }  # => ["Kid", "Kily", "KDE"]
+   def similar_elements(index=0)
+     i = index+1
+     loop do
+       break if i >= self.length
+       if block_given?
+         break unless yield(self[index], self[i])
+       else
+         break if self[index] == self[i]
+       end
+       i += 1
+     end
+     self[index...i]
+   end
+ end
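Each rule above follows the same pattern: sort the chunks by its criterion, then use Array#similar_elements to keep the leading run of ties. A minimal sketch of that narrowing step, using hypothetical Struct stand-ins in place of real Chunks:

    require 'rmmseg/rule_helper'   # defines Array#similar_elements

    # Stand-in "chunks" scored by total_length (made-up values).
    Scored = Struct.new(:total_length)
    chunks = [Scored.new(3), Scored.new(5), Scored.new(5), Scored.new(4)]

    best = chunks.sort { |a, b|
      b.total_length <=> a.total_length      # descending, best first
    }.similar_elements { |a, b|
      a.total_length == b.total_length       # keep every chunk tied with the best
    }
    best.map { |c| c.total_length }           # => [5, 5]

    [1, 2, 2, 2, 3, 3, 5].similar_elements(1) # => [2, 2, 2]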
@@ -0,0 +1,22 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+
+ module RMMSeg
+   class SimpleAlgorithm
+     include Algorithm
+
+     # Create a new SimpleAlgorithm. The only rule used by this
+     # algorithm is MMRule.
+     def initialize(text)
+       super
+       @rules = [ MMRule.new ]
+     end
+
+     # Create all possible one-word chunks starting from +@index+.
+     def create_chunks
+       find_match_words(@chars, @index).map { |word|
+         Chunk.new([word])
+       }
+     end
+   end
+ end
@@ -0,0 +1,14 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Smallest variance of word length rule.
+   class SVWLRule
+     def filter(chunks)
+       chunks.sort { |a, b|
+         a.variance <=> b.variance
+       }.similar_elements { |a, b|
+         a.variance == b.variance
+       }
+     end
+   end
+ end
@@ -0,0 +1,22 @@
+ module RMMSeg
+   # A Token consists of a term's text and the start and end offsets
+   # of the term.
+   class Token
+     # Text of the token.
+     attr_reader :text
+
+     # The start position of the token. This is a *byte* index, not a
+     # character index.
+     attr_reader :start_pos
+
+     # One greater than the position of the last byte of the
+     # token. This is a *byte* index, not a character index.
+     attr_reader :end_pos
+
+     def initialize(text, start_pos, end_pos)
+       @text = text
+       @start_pos = start_pos
+       @end_pos = end_pos
+     end
+   end
+ end
@@ -0,0 +1,37 @@
+ module RMMSeg
+   # An object representing a word produced by the segmentation process.
+   class Word
+     TYPES = {
+       :unrecognized => :unrecognized,
+       :basic_latin_word => :basic_latin_word,
+       :cjk_word => :cjk_word
+     }.freeze
+
+     # The content text of the word.
+     attr_reader :text
+
+     # The type of the word, one of the keys of TYPES.
+     attr_reader :type
+
+     # The frequency of the word. This value is meaningful only
+     # when this is a one-character CJK word.
+     attr_reader :frequency
+
+     # Initialize a Word object.
+     def initialize(text, type=TYPES[:unrecognized], frequency=nil)
+       @text = text
+       @type = type
+       @frequency = frequency
+     end
+
+     # The number of characters in the word, *not* the number of bytes.
+     def length
+       @text.jlength
+     end
+
+     # The number of bytes in the word.
+     def byte_size
+       @text.length
+     end
+   end
+ end
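A short sketch of Word in isolation, assuming Ruby 1.8 with the jcode library (Word#length relies on String#jlength) and a top-level require 'rmmseg':

    $KCODE = 'u'       # Ruby 1.8: count characters as UTF-8
    require 'jcode'    # provides String#jlength
    require 'rmmseg'   # assumed to load RMMSeg::Word

    w = RMMSeg::Word.new("中国", RMMSeg::Word::TYPES[:cjk_word])
    w.length      # => 2  (characters)
    w.byte_size   # => 6  (UTF-8 bytes)
    w.type        # => :cjk_word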