migemo-lib 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Ruby/Migemo - a library for Japanese incremental search.
4
+ #
5
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
6
+ # All rights reserved.
7
+ # This is free software with ABSOLUTELY NO WARRANTY.
8
+ #
9
+ # You can redistribute it and/or modify it under the terms of
10
+ # the GNU General Public License version 2.
11
+
12
+ $LOAD_PATH << File.dirname(File.expand_path(__FILE__))
13
+ require 'migemo/core_ext/string'
14
+ require 'migemo-dict'
15
+ require 'migemo-regex'
16
+ require 'migemo/version'
17
+ require 'romkan'
18
+ include MigemoRegex
19
+
20
+
21
# Migemo expands a (usually romaji) search pattern into a regex tree, and
# from there into a regex string, that also matches the kana spellings and
# dictionary words the pattern may stand for.
#
# Collaborators referenced here but defined elsewhere: MigemoStaticDict,
# RegexAlternation, RegexCompiler, RegexRendererFactory, and the String
# extensions to_kana, consonant?, expand_consonant, to_katakana and
# to_fullwidth.
class Migemo
  # pattern:: the search string to expand.
  # dict::    nil to load the dictionary shipped next to this file
  #           (../data/migemo-dict), a String path passed to
  #           MigemoStaticDict.new, or any other object, which is used as the
  #           static dictionary as is.
  def initialize(pattern, dict = nil)
    @static_dict = if dict.nil?
                     MigemoStaticDict.new(File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict')
                   elsif dict.is_a?(String)
                     MigemoStaticDict.new(dict)
                   else
                     dict
                   end
    @type = "ruby"
    @pattern = pattern
    @insertion = ""
    @optimization = 3

    @dict_cache = nil
    @user_dict = nil
    @regex_dict = nil
    @with_paren = false
  end

  # optimization:: level handed to RegexCompiler#optimize; any truthy value
  #                (including 0) triggers the call.
  # type::         renderer type handed to RegexRendererFactory ("ruby" by default).
  # insertion::    string handed to RegexRendererFactory ("" by default).
  # dict_cache::   optional object answering lookup(pattern); consulted first.
  # user_dict::    optional extra dictionary whose hits are appended.
  # regex_dict::   optional dictionary of raw regexes joined onto the output.
  # with_paren::   forwarded to the renderer's with_paren=.
  attr_accessor :optimization, :type, :insertion, :dict_cache,
                :user_dict, :regex_dict, :with_paren

  # Returns the regex tree for the pattern.
  #
  # An empty pattern yields a fresh RegexAlternation. Otherwise the cache
  # (when set and when it has an answer) wins over compiling from scratch,
  # and every entry found through the user dictionary is pushed onto the
  # result.
  def lookup
    return RegexAlternation.new if @pattern == ""

    result = (@dict_cache && lookup_cache) || lookup0
    lookup_user_dict.each { |x| result.push(x) } if @user_dict
    result
  end

  # Alias-like entry point: the regex tree is exactly what #lookup returns.
  def regex_tree
    lookup
  end

  # Renders the regex tree to a String using the configured type, insertion
  # and with_paren settings; regexes found in regex_dict (when set) are
  # joined onto the rendered string by the renderer.
  def regex
    renderer = RegexRendererFactory.new(lookup, @type, @insertion)
    renderer.with_paren = @with_paren
    string = renderer.render
    string = renderer.join_regexes(string, lookup_regex_dict) if @regex_dict
    string
  end

  private

  # Expands the pattern into the kana strings it may be a prefix of.
  #
  # `do'   => (ど)
  # `d'    => (っ だ ぢ づ で ど)
  # `sh'   => (しゃ し しゅ しぇ しょ)
  # `don'  => (どん どな どに どぬ どね どの どっ) # special case 1
  # `nodd' => (のっ)                               # special case 2
  # `doc'  => (どっ どち)                          # special case 3
  # `dox'  => (どっ どゃ どゅ どょ)                # special case 4
  # `essy' => (えっしゃ えっしゅ えっしょ)         # special case 5
  # `ny'   => (にゃ にゅ にょ)                     # special case 6
  #
  # Returns a sorted Array of Strings ([] when the converted pattern has no
  # last character to inspect).
  def expand_kanas
    kana = @pattern.downcase.to_kana
    head, last = split_last(kana)
    return [] if last.nil?

    cand =
      if last.consonant?
        expand_trailing_consonant(head, last)
      elsif last == "ん" # special case 1
        [kana] + ("n".expand_consonant + ["っ"]).map { |x| head + x.to_kana }
      else
        [kana]
      end
    cand.sort
  end

  # Splits str into [everything before the last character, last character].
  # Returns nil when str has no character to split off. MatchData is used
  # instead of $1/$2 so the captures cannot be clobbered by later matches.
  def split_last(str)
    m = /^(.*)(.)$/.match(str)
    m && [m[1], m[2]]
  end

  # Candidates for a converted pattern that ends in the unfinished consonant
  # +last+; +head+ is everything in front of it.
  def expand_trailing_consonant(head, last)
    head2, beforelast = split_last(head)
    if beforelast && beforelast.consonant?
      return [head2 + "っ"] if last == beforelast # special case 2

      head3, before_beforelast = split_last(head2)
      # special case 5: a doubled consonant in front of the last one
      # (`essy') becomes a small tsu followed by the expanded pair.
      prefix = before_beforelast == beforelast ? head3 + "っ" : head2
      return (beforelast + last).expand_consonant.map { |x| prefix + x.to_kana }
    end

    # special case 6: looked up on the raw pattern, not on the kana form.
    ny = /^(.*?)(n?)ny$/.match(@pattern)
    if ny && ny[2] == ""
      return "ny".expand_consonant.map { |x| ny[1] + x.to_kana }
    end

    # Built with + so the array returned by expand_consonant is not mutated.
    deriv = last.expand_consonant + ["xtsu"]
    if last == "c" # special case 3
      deriv += ["chi"]
    elsif last == "x" # special case 4
      deriv += ["xya", "xyu", "xyo", "xwa"]
    end
    deriv.map { |x| head + x.to_kana }
  end

  # Collects the values of every item +dict+ yields for +pattern+.
  #
  # `めし' => (飯 飯合 雌蘂 雌蕊 飯櫃 目下 飯粒 召使 飯屋)
  #
  # Raises RuntimeError when pattern is nil.
  def expand_words(dict, pattern)
    raise "pattern must not be nil" if pattern.nil?

    words = []
    # concat appends in place; the previous += copied the whole array per hit.
    dict.lookup(pattern) { |item| words.concat(item.values) }
    words
  end

  # Asks the cache for the pattern; the result is falsy on a miss as far as
  # #lookup is concerned.
  def lookup_cache
    @dict_cache.lookup(@pattern)
  end

  # Compiles the regex tree from scratch: the pattern itself, its full-width
  # form, every kana expansion (hiragana and katakana) and all static
  # dictionary words found for the expansions and for the raw pattern.
  def lookup0
    compiler = RegexCompiler.new
    compiler.push(@pattern)
    compiler.push(@pattern.to_fullwidth)
    expand_kanas.each do |kana|
      compiler.push(kana)
      compiler.push(kana.to_katakana)
      expand_words(@static_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@static_dict, @pattern).each { |word| compiler.push(word) }
    finish_compile(compiler)
  end

  # Same as #lookup0 but restricted to words from the user dictionary.
  def lookup_user_dict
    compiler = RegexCompiler.new
    expand_kanas.each do |kana|
      expand_words(@user_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@user_dict, @pattern).each { |word| compiler.push(word) }
    finish_compile(compiler)
  end

  # Shared tail of the two compile paths: de-duplicate, optimise when an
  # optimisation level is set, and return the compiler's regex tree.
  def finish_compile(compiler)
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Collects the values of every item the regex dictionary yields for the
  # pattern.
  def lookup_regex_dict
    regexes = []
    @regex_dict.lookup(@pattern) { |item| regexes.concat(item.values) }
    regexes
  end
end
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
# Core extensions used by Migemo: kana conversion, escape-aware
# first/last/rest accessors, half-width to full-width conversion and prefix
# comparison.
class String
  # Hiragana to Katakana
  def to_katakana
    gsub(/う゛/, 'ヴ').tr('ぁ-ん', 'ァ-ン')
  end

  # First character of the string; a backslash followed by a character
  # counts as one unit. Returns nil when there is nothing to match.
  def first
    self[/^(\\.|.)/, 1]
  end

  # Last character of the string, with the same backslash rule as #first.
  def last
    self[/(\\.|.)$/, 1]
  end

  # Everything after #first (up to the end of the first line, since `.`
  # does not cross newlines). Returns nil when there is no first unit.
  def rest
    self[/^(\\.|.)(.*)/, 2]
  end

  # Half-width (ASCII) character => full-width counterpart.
  #
  # The table is generated from code points instead of being spelled out:
  # printable ASCII U+0021..U+007E maps onto the Fullwidth Forms block
  # U+FF01..U+FF5E (a fixed offset of 0xFEE0) and the space maps onto the
  # ideographic space U+3000. Spelling the entries out is fragile: once the
  # right-hand sides get folded to ASCII the table turns into an identity
  # map and the backslash entry becomes an unterminated literal.
  #
  # Five characters deliberately use typographic counterparts rather than
  # the Fullwidth Forms code point.
  HANZEN_TAB = begin
    table = { " " => "\u3000" }
    (0x21..0x7E).each do |code|
      table[[code].pack('U')] = [code + 0xFEE0].pack('U')
    end
    table.update(
      '"' => "\u201D", # right double quotation mark
      "'" => "\u2019", # right single quotation mark
      "-" => "\u2212", # minus sign
      "`" => "\u2018", # left single quotation mark
      "~" => "\u203E"  # overline
    )
    table.freeze
  end

  # Matches any single character that HANZEN_TAB can convert.
  HANZEN_RE = Regexp.union(HANZEN_TAB.keys.sort)

  # Returns a copy with every convertible half-width character replaced by
  # its full-width counterpart; other characters are left alone.
  def to_fullwidth
    gsub(HANZEN_RE) { |s| HANZEN_TAB[s] }
  end

  # Compares the leading string.length characters of self with +string+
  # using <=> (0 when self starts with +string+).
  def prefix_match(string)
    self[0, string.length] <=> string
  end
end
@@ -0,0 +1,8 @@
1
class Migemo
  module VERSION #:nodoc:
    # Individual components of the release number.
    MAJOR, MINOR, TINY = 0, 4, 3

    # Dotted form of the components above.
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,28 @@
1
+ require 'test_helper'
2
# Checks that a Migemo instance given a dictionary cache renders the same
# regex as one without it.
class CacheTest < Test::Unit::TestCase
  # Collects the distinct leading word characters of the test dictionary's
  # entries into @words, generates the cache file next to the dictionary
  # when it does not exist yet, and loads it into @cache_dict.
  def setup
    base_dir = File.dirname(File.expand_path(__FILE__))
    dict_file = base_dir + '/../data/test-dict'
    chars_file = base_dir + '/../data/migemo-chars'
    cache_file = dict_file + '.cache'
    File.open(dict_file) do |f|
      @words = f.readlines.map { |l| l[/^(\w).*? /, 1] }.compact.uniq
    end
    # File.exist? rather than File.exists?: the latter is deprecated and no
    # longer available on Ruby 3.2+.
    unless File.exist?(cache_file)
      Migemo::Cache.new(dict_file, chars_file).generate.save(cache_file)
    end
    @cache_dict = MigemoDictCache.new(cache_file)
  end

  # For every collected character, the cached and the uncached instance
  # must produce identical regex strings.
  def test_caches
    @words.each do |w|
      @normal = new_migemo(w)
      @cache = new_migemo(w)
      @cache.dict_cache = @cache_dict
      assert_equal @normal.regex, @cache.regex
    end
  end

  private

  # Migemo for +pattern+ wired to the shared test, user and regex dictionaries.
  def new_migemo(pattern)
    migemo = Migemo.new(pattern, migemo_dict)
    migemo.user_dict = user_dict
    migemo.regex_dict = regex_dict
    migemo
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Exercises rendering of a character class that contains regex
# metacharacters, for both the default and the emacs output type.
class CharclassTest < Test::Unit::TestCase
  def test_ruby_type
    assert_equal "[]$%@\\\\-]|sym|sym", Migemo.new('sym', migemo_dict).regex
  end

  def test_emacs_type
    emacs = Migemo.new('sym', migemo_dict)
    emacs.type = 'emacs'
    rendered = emacs.regex
    assert_equal "[]$%@\\-]\\|sym\\|sym", rendered
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# ConvertTest: passes a small SKK-style dictionary text (comment lines plus
# four entries) to Migemo::Convert.new and asserts that convert.header
# followed by convert.transfer, joined with newlines, equals the expected
# text. Per the expected text, the result starts with a generated header,
# keeps the input's comment lines, lists `motion' before the kana entries,
# has no line for the `りかいs' entry, and keeps only the part of a value
# in front of `;'.
# NOTE(review): the heredoc bodies are whitespace-sensitive; confirm the
# separators (tab vs. space) against the upstream file before reformatting.
+ class ConvertTest < Test::Unit::TestCase
5
+ def test_convert
6
+ input =<<-EOF
7
+ ;;
8
+ ;; This is a comment line.
9
+ ;;
10
+ りかい /理解/
11
+ りかいs /理解/
12
+ motion /モーション/
13
+ りくとく /六徳;人が守るべき六つの徳。「ろくとく」とも/
14
+ EOF
15
+ expects =<<-EOF
16
+ ;;
17
+ ;; This is Migemo's dictionary generated from SKK's.
18
+ ;;
19
+ ;;
20
+ ;; This is a comment line.
21
+ ;;
22
+ motion モーション
23
+ りかい 理解
24
+ りくとく 六徳
25
+ EOF
26
+ convert = Migemo::Convert.new(input)
27
+ assert_equal expects, (convert.header + convert.transfer).join("\n") + "\n"
28
+ end
29
+ end
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks the emacs-flavoured rendering (escaped alternation and grouping)
# for the pattern `mot'.
class TestEmacsType < Test::Unit::TestCase
  def test_mot
    expected = 'mot\|mot\|も[たちっつてと]\|モ\([タチッツテト]\|ー\(ション\|ター\)\|スラ\)'
    emacs = Migemo.new('mot', migemo_dict)
    emacs.type = 'emacs'
    assert_equal expected, emacs.regex
  end
end
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
# Checks that the insertion string is woven between the characters of the
# rendered regex, for the default and the emacs output type.
class InsertionTest < Test::Unit::TestCase
  def test_mot
    assert_equal 'm\s *o\s *t|m\s *o\s *t|も\s *[たちっつてと]|モ\s *(?:[タチッツテト]|ー\s *(?:シ\s *ョ\s *ン|タ\s *ー)|ス\s *ラ)',
                 migemo_with_insertion.regex
  end

  def test_mot_as_emacs
    assert_equal 'm\s *o\s *t\|m\s *o\s *t\|も\s *[たちっつてと]\|モ\s *\([タチッツテト]\|ー\s *\(シ\s *ョ\s *ン\|タ\s *ー\)\|ス\s *ラ\)',
                 migemo_with_insertion('emacs').regex
  end

  private

  # Migemo for `mot' with the whitespace insertion applied; +type+ is set
  # first when given, matching the order used by the emacs test.
  def migemo_with_insertion(type = nil)
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = type if type
    migemo.insertion = "\\s *"
    migemo
  end
end
@@ -0,0 +1,50 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks the rendered regex for a handful of patterns at the different
# optimization levels.
# NOTE(review): several expectations repeat the ASCII pattern (`ki|ki');
# presumably the second alternative is the full-width rendering of the
# pattern and was folded to ASCII in this copy of the file — confirm
# against the upstream source.
class MigemoTest < Test::Unit::TestCase
  def test_empty_string
    assert_regex "", ""
  end

  def test_k
    assert_regex "[kkかきくけこっカキクケコッ機帰気]", "k"
  end

  def test_ki_with_no_optimize
    assert_regex "ki|ki|き|キ|気|機|帰|機能|帰納|帰農|機能主義|機能的|帰納的", "ki", 0
  end

  def test_ki_with_optimize1
    assert_regex "ki|ki|き|キ|機|帰|気", "ki", 1
  end

  def test_ki_with_optimize3
    assert_regex "[きキ機帰気]|ki|ki", "ki", 3
  end

  def test_kin
    assert_regex "kin|kin|き[っなにぬねのん]|キ[ッナニヌネノン]|機能|帰[納農]", "kin", 3
  end

  def test_mot_with_optimze2
    assert_regex "mot|mot|も(?:た|ち|っ|つ|て|と)|モ(?:ー(?:ション|ター)|スラ|タ|チ|ッ|ツ|テ|ト)", "mot", 2
  end

  def test_mot_with_optimze3
    assert_regex "mot|mot|も[たちっつてと]|モ(?:[タチッツテト]|ー(?:ション|ター)|スラ)", "mot", 3
  end

  private

  # Builds a Migemo for +pattern+, overrides the optimization level only
  # when one is given, and compares the rendered regex with +expected+.
  def assert_regex(expected, pattern, optimization = nil)
    migemo = Migemo.new(pattern, migemo_dict)
    migemo.optimization = optimization unless optimization.nil?
    assert_equal expected, migemo.regex
  end
end
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks that entries from the regex dictionary (and, in the last case, the
# user dictionary) show up in the rendered regex.
class RegexDictTest < Test::Unit::TestCase
  def test_m
    subject = migemo_with_regex_dict('m')
    assert_equal '[mmっまみむめもッマミムメモ]|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', subject.regex
  end

  def test_ur
    subject = migemo_with_regex_dict('ur')
    assert_equal 'ur|ur|う[っらりるれろ]|ウ[ッラリルレロ]|\(\(http\|https\|ftp\|afs\|wais\|telnet\|ldap\|gopher\|news\|nntp\|rsync\|mailto\)://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+\)', subject.regex
  end

  def test_m_with_userdict
    subject = migemo_with_regex_dict('m')
    subject.user_dict = user_dict
    assert_equal '[mmっまみむめもッマミムメモ]|Message Of The Day|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', subject.regex
  end

  private

  # Migemo for +pattern+ with the shared regex dictionary attached.
  def migemo_with_regex_dict(pattern)
    migemo = Migemo.new(pattern, migemo_dict)
    migemo.regex_dict = regex_dict
    migemo
  end
end
@@ -0,0 +1,16 @@
1
+ require 'test_helper'
2
+
3
# Smoke test: the regex rendered for randomly chosen dictionary lines must
# be accepted by Regexp.compile.
class RegexTest < Test::Unit::TestCase
  def test_compile
    dict_path = File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict'
    lines = File.open(dict_path) { |f| f.readlines.map(&:chomp) }
    # Ten draws; slice! removes each drawn line so it is not picked twice.
    patterns = Array.new(10) { lines.slice!(rand(lines.length)) }
    patterns.each do |pattern|
      rendered = Migemo.new(pattern, migemo_dict).regex
      assert Regexp.compile(rendered)
    end
  end
end
16
+