migemo-lib 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,183 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Ruby/Migemo - a library for Japanese incremental search.
4
+ #
5
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
6
+ # All rights reserved.
7
+ # This is free software with ABSOLUTELY NO WARRANTY.
8
+ #
9
+ # You can redistribute it and/or modify it under the terms of
10
+ # the GNU General Public License version 2.
11
+
12
+ $LOAD_PATH << File.dirname(File.expand_path(__FILE__))
13
+ require 'migemo/core_ext/string'
14
+ require 'migemo-dict'
15
+ require 'migemo-regex'
16
+ require 'migemo/version'
17
+ require 'romkan'
18
+ include MigemoRegex
19
+
20
+
21
# Expands a romaji search pattern into a regular expression that covers the
# pattern itself, its fullwidth form, its possible kana readings (hiragana
# and katakana) and the dictionary words found for those readings.
class Migemo
  # Optimization level handed to RegexCompiler#optimize (skipped when falsy).
  attr_accessor :optimization
  # Renderer type handed to RegexRendererFactory (defaults to "ruby").
  attr_accessor :type
  # Insertion string handed to RegexRendererFactory (defaults to "").
  attr_accessor :insertion
  # Optional cache dictionary consulted before the static dictionary.
  attr_accessor :dict_cache
  # Optional user dictionary whose words are appended to the result.
  attr_accessor :user_dict
  # Optional dictionary of raw regexes joined to the rendered string.
  attr_accessor :regex_dict
  # Forwarded to the renderer's +with_paren=+ writer.
  attr_accessor :with_paren

  # pattern:: the search string to expand.
  # dict::    nil to load the dictionary at ../data/migemo-dict relative to
  #           this file, a String path to load a dictionary from, or an
  #           already constructed dictionary object, which is used as is.
  def initialize(pattern, dict = nil)
    @static_dict =
      case dict
      when nil
        MigemoStaticDict.new(File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict')
      when String
        MigemoStaticDict.new(dict)
      else
        dict
      end
    @type = "ruby"
    @pattern = pattern
    @insertion = ""
    @optimization = 3

    @dict_cache = nil
    @user_dict = nil
    @regex_dict = nil
    @with_paren = false
  end

  # Returns the regex tree for the pattern: an empty RegexAlternation for an
  # empty pattern, otherwise the cached entry (when a cache is set and has
  # one) or a freshly compiled tree, with user dictionary words appended.
  def lookup
    return RegexAlternation.new if @pattern == ""

    result = (@dict_cache && lookup_cache) || lookup0
    lookup_user_dict.each { |x| result.push(x) } if @user_dict
    result
  end

  # Alias-style accessor for the tree produced by #lookup.
  def regex_tree
    lookup
  end

  # Renders the tree as a regex string for the configured type, appending
  # the regex dictionary entries when a regex dictionary is set.
  def regex
    renderer = RegexRendererFactory.new(lookup, @type, @insertion)
    renderer.with_paren = @with_paren
    rendered = renderer.render
    rendered = renderer.join_regexes(rendered, lookup_regex_dict) if @regex_dict
    rendered
  end

  private

  # Returns the sorted kana candidates for the (downcased) pattern.
  #
  # `do'   => (ど)
  # `d'    => (っ だ ぢ づ で ど)
  # `sh'   => (しゃ し しゅ しぇ しょ)
  # `don'  => (どん どな どに どぬ どね どの どっ)   # special case 1
  # `nodd' => (のっ)                                 # special case 2
  # `doc'  => (どっ どち)                            # special case 3
  # `dox'  => (どっ どゃ どゅ どょ)                  # special case 4
  # `essy' => (えっしゃ えっしゅ えっしょ)           # special case 5
  # `ny'   => (にゃ にゅ にょ)                       # special case 6
  def expand_kanas
    romaji = @pattern.downcase
    kana = romaji.to_kana
    split = /^(.*)(.)$/.match(kana)
    return [] unless split

    head = split[1]
    last = split[2]
    cand = []

    if last.consonant?
      head_split = /^(.*)(.)$/.match(head)
      if head_split && head_split[2].consonant?
        # The pattern ends in two unconverted consonants.
        head2 = head_split[1]
        beforelast = head_split[2]
        cluster = beforelast + last
        if last == beforelast # special case 2
          cand.push(head2 + "っ")
        elsif (geminate = /^(.*)(.)$/.match(head2)) && beforelast == geminate[2]
          # special case 5
          cand += cluster.expand_consonant.map { |x| geminate[1] + "っ" + x.to_kana }
        else
          cand += cluster.expand_consonant.map { |x| head2 + x.to_kana }
        end
      elsif (ny = /^(.*?)(n?)ny$/.match(romaji)) && ny[2] == "" # special case 6
        # Match the downcased romaji (the kana above was built from it too)
        # and convert the leading romaji to kana, so that e.g. "kony" gives
        # "こにゃ" rather than a romaji/kana mixture such as "koにゃ".
        kana_head = ny[1].to_kana
        cand += "ny".expand_consonant.map { |x| kana_head + x.to_kana }
      else
        deriv = last.expand_consonant + ["xtsu"]
        if last == "c" # special case 3
          deriv += ["chi"]
        elsif last == "x" # special case 4
          deriv += ["xya", "xyu", "xyo", "xwa"]
        end
        cand += deriv.map { |x| head + x.to_kana }
      end
    elsif last == "ん" # special case 1
      cand.push(kana)
      cand += ("n".expand_consonant + ["っ"]).map { |x| head + x.to_kana }
    else
      cand.push(kana)
    end
    cand.sort
  end

  # Collects the values of every item +dict+ yields for +pattern+.
  #
  # `めし' => (飯 飯合 雌蘂 雌蕊 飯櫃 目下 飯粒 召使 飯屋)
  #
  # Raises RuntimeError when +pattern+ is nil.
  def expand_words(dict, pattern)
    raise "pattern must not be nil" if pattern.nil?

    words = []
    dict.lookup(pattern) { |item| words += item.values }
    words
  end

  # Looks the raw pattern up in the cache dictionary.
  def lookup_cache
    @dict_cache.lookup(@pattern)
  end

  # Compiles the regex tree from the pattern, its fullwidth form, every kana
  # candidate (hiragana and katakana) and the static dictionary words for
  # both the kana candidates and the raw pattern.
  def lookup0
    compiler = RegexCompiler.new
    compiler.push(@pattern)
    compiler.push(@pattern.to_fullwidth)
    expand_kanas.each do |kana|
      compiler.push(kana)
      compiler.push(kana.to_katakana)
      expand_words(@static_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@static_dict, @pattern).each { |word| compiler.push(word) }
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Compiles the regex tree of user dictionary words found for the kana
  # candidates and for the raw pattern.
  def lookup_user_dict
    compiler = RegexCompiler.new
    expand_kanas.each do |kana|
      expand_words(@user_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@user_dict, @pattern).each { |word| compiler.push(word) }
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Collects the regex strings the regex dictionary holds for the pattern.
  def lookup_regex_dict
    regexes = []
    @regex_dict.lookup(@pattern) { |item| regexes += item.values }
    regexes
  end
end
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
# String helpers used by Migemo: kana conversion, escape-aware
# first/last/rest splitting and halfwidth-to-fullwidth conversion.
class String
  # Hiragana to Katakana
  def to_katakana
    gsub(/う゛/, 'ヴ').tr('ぁ-ん', 'ァ-ン')
  end

  # Leading character, or a leading backslash together with the character
  # it escapes. Returns nil when nothing matches (e.g. the empty string).
  def first
    self[/^(\\.|.)/, 1]
  end

  # Trailing character, or a trailing backslash-escaped pair.
  # Returns nil when nothing matches.
  def last
    self[/(\\.|.)$/, 1]
  end

  # Everything after #first. Returns nil when nothing matches.
  def rest
    self[/^(\\.|.)(.*)/, 2]
  end

  # Maps every printable ASCII character (and the space) to its fullwidth
  # counterpart. The fullwidth forms U+FF01..U+FF5E sit at a fixed offset of
  # 0xFEE0 from ASCII 0x21..0x7E; the entries merged afterwards are the
  # characters that map to a typographic symbol instead.
  HANZEN_TAB = (0x21..0x7e).each_with_object({}) { |code, table|
    table[[code].pack("U")] = [code + 0xFEE0].pack("U")
  }.merge(
    " " => "\u3000", # ideographic space
    '"' => "”",
    "'" => "’",
    "-" => "−",
    "`" => "‘",
    "~" => "‾"
  ).freeze

  # Matches any single character that has an entry in HANZEN_TAB.
  HANZEN_RE = Regexp.union(HANZEN_TAB.keys.sort)

  # Returns a copy with every character listed in HANZEN_TAB replaced by its
  # fullwidth counterpart; other characters are left untouched.
  def to_fullwidth
    gsub(HANZEN_RE, HANZEN_TAB)
  end

  # Compares the leading +string.length+ characters of self with +string+
  # using <=> (0 when self starts with +string+).
  def prefix_match(string)
    self[0, string.length] <=> string
  end
end
@@ -0,0 +1,8 @@
1
class Migemo
  # Version number of the library, split into its components.
  module VERSION #:nodoc:
    MAJOR = 0
    MINOR = 4
    TINY  = 3

    # Dotted version string, e.g. "0.4.3". Frozen so that the constant
    # cannot be altered in place by a caller.
    STRING = [MAJOR, MINOR, TINY].compact.join('.').freeze
  end
end
@@ -0,0 +1,28 @@
1
+ require 'test_helper'
2
# Checks that a Migemo instance backed by a cache dictionary renders the same
# regex as one that compiles it from the dictionaries directly.
class CacheTest < Test::Unit::TestCase
  # Collects the distinct leading word characters of the test dictionary
  # entries and makes sure a cache file exists for that dictionary.
  def setup
    base_dir = File.dirname(File.expand_path(__FILE__))
    dict_file = base_dir + '/../data/test-dict'
    chars_file = base_dir + '/../data/migemo-chars'
    cache_file = dict_file + '.cache'

    @words = File.open(dict_file) do |f|
      f.readlines.map { |line| line[/^(\w).*? /, 1] }.compact.uniq
    end

    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    unless File.exist?(cache_file)
      Migemo::Cache.new(dict_file, chars_file).generate.save(cache_file)
    end
    @cache_dict = MigemoDictCache.new(cache_file)
  end

  def test_caches
    @words.each do |word|
      # Two identically configured instances; only the second gets the cache.
      normal, cached = Array.new(2) do
        migemo = Migemo.new(word, migemo_dict)
        migemo.user_dict = user_dict
        migemo.regex_dict = regex_dict
        migemo
      end
      cached.dict_cache = @cache_dict
      assert_equal normal.regex, cached.regex
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks how symbol characters are rendered inside a character class.
# The third alternative of each expectation is the fullwidth form of the
# pattern (as pushed via String#to_fullwidth), not a repeat of the ASCII one.
class CharclassTest < Test::Unit::TestCase
  def test_ruby_type
    migemo = Migemo.new('sym', migemo_dict)
    assert_equal "[]$%@\\\\-]|sym|sym", migemo.regex
  end

  def test_emacs_type
    migemo = Migemo.new('sym', migemo_dict)
    migemo.type = 'emacs'
    assert_equal "[]$%@\\-]\\|sym\\|sym", migemo.regex
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks the conversion of an SKK-style dictionary into Migemo's format.
class ConvertTest < Test::Unit::TestCase
  def test_convert
    # Heredoc bodies start at column 0 on purpose: <<- keeps leading
    # whitespace, and the comparison below is exact.
    skk_source = <<-EOF
;;
;; This is a comment line.
;;
りかい /理解/
りかいs /理解/
motion /モーション/
りくとく /六徳;人が守るべき六つの徳。「ろくとく」とも/
    EOF
    expected = <<-EOF
;;
;; This is Migemo's dictionary generated from SKK's.
;;
;;
;; This is a comment line.
;;
motion モーション
りかい 理解
りくとく 六徳
    EOF
    converter = Migemo::Convert.new(skk_source)
    output_lines = converter.header + converter.transfer
    assert_equal expected, "#{output_lines.join("\n")}\n"
  end
end
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks the Emacs-style rendering (\| alternation, \( \) grouping).
class TestEmacsType < Test::Unit::TestCase
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    # The second alternative is the fullwidth form of the pattern.
    assert_equal 'mot\|mot\|も[たちっつてと]\|モ\([タチッツテト]\|ー\(ション\|ター\)\|スラ\)', migemo.regex
  end
end
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
# Checks that the insertion string is placed between consecutive characters
# of every alternative. The second alternative of each expectation is the
# fullwidth form of the pattern.
class InsertionTest < Test::Unit::TestCase
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t|m\s *o\s *t|も\s *[たちっつてと]|モ\s *(?:[タチッツテト]|ー\s *(?:シ\s *ョ\s *ン|タ\s *ー)|ス\s *ラ)', migemo.regex
  end

  def test_mot_as_emacs
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t\|m\s *o\s *t\|も\s *[たちっつてと]\|モ\s *\([タチッツテト]\|ー\s *\(シ\s *ョ\s *ン\|タ\s *ー\)\|ス\s *ラ\)', migemo.regex
  end
end
@@ -0,0 +1,50 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks the rendered regex for several patterns and optimization levels.
# Every expectation lists the ASCII pattern followed by its fullwidth form
# (the two strings Migemo pushes first); they are distinct strings, so both
# survive the compiler's uniq step.
class MigemoTest < Test::Unit::TestCase
  def test_empty_string
    migemo = Migemo.new("", migemo_dict)
    assert_equal "", migemo.regex
  end

  def test_k
    migemo = Migemo.new("k", migemo_dict)
    assert_equal "[kkかきくけこっカキクケコッ機帰気]", migemo.regex
  end

  def test_ki_with_no_optimize
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 0
    assert_equal "ki|ki|き|キ|気|機|帰|機能|帰納|帰農|機能主義|機能的|帰納的", migemo.regex
  end

  def test_ki_with_optimize1
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 1
    assert_equal "ki|ki|き|キ|機|帰|気", migemo.regex
  end

  def test_ki_with_optimize3
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 3
    assert_equal "[きキ機帰気]|ki|ki", migemo.regex
  end

  def test_kin
    migemo = Migemo.new("kin", migemo_dict)
    migemo.optimization = 3
    assert_equal "kin|kin|き[っなにぬねのん]|キ[ッナニヌネノン]|機能|帰[納農]", migemo.regex
  end

  def test_mot_with_optimze2
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 2
    assert_equal "mot|mot|も(?:た|ち|っ|つ|て|と)|モ(?:ー(?:ション|ター)|スラ|タ|チ|ッ|ツ|テ|ト)", migemo.regex
  end

  def test_mot_with_optimze3
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 3
    assert_equal "mot|mot|も[たちっつてと]|モ(?:[タチッツテト]|ー(?:ション|ター)|スラ)", migemo.regex
  end
end
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
# Checks that regex dictionary entries (and user dictionary words) are
# appended to the rendered regex. In each expectation the pattern appears
# once in ASCII and once in its fullwidth form.
class RegexDictTest < Test::Unit::TestCase
  def test_m
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal '[mmっまみむめもッマミムメモ]|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end

  def test_ur
    migemo = Migemo.new('ur', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal 'ur|ur|う[っらりるれろ]|ウ[ッラリルレロ]|\(\(http\|https\|ftp\|afs\|wais\|telnet\|ldap\|gopher\|news\|nntp\|rsync\|mailto\)://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+\)', migemo.regex
  end

  def test_m_with_userdict
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    migemo.user_dict = user_dict
    assert_equal '[mmっまみむめもッマミムメモ]|Message Of The Day|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end
end
@@ -0,0 +1,16 @@
1
+ require 'test_helper'
2
+
3
# Smoke test: the regex generated for randomly chosen dictionary lines must
# be accepted by Regexp.compile.
class RegexTest < Test::Unit::TestCase
  # Number of dictionary lines tried per run.
  SAMPLE_SIZE = 10

  def test_compile
    dict_path = File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict'
    lines = File.open(dict_path) { |f| f.readlines.map(&:chomp) }

    # Array#sample picks distinct lines and simply returns fewer of them when
    # the file is short, so no nil pattern can reach Migemo.new.
    lines.sample(SAMPLE_SIZE).each do |pattern|
      migemo = Migemo.new(pattern, migemo_dict)
      assert Regexp.compile(migemo.regex)
    end
  end
end
16
+