migemo-lib 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -0
- data/data/migemo-dict +166746 -0
- data/data/migemo-dict.cache +0 -0
- data/data/migemo-dict.cache.idx +0 -0
- data/data/migemo-dict.idx +0 -0
- data/doc/migemo.ja.rd +66 -0
- data/lib/migemo-dict.rb +126 -0
- data/lib/migemo-regex.rb +340 -0
- data/lib/migemo.rb +183 -0
- data/lib/migemo/core_ext/string.rb +60 -0
- data/lib/migemo/version.rb +8 -0
- data/test/cache_test.rb +28 -0
- data/test/charclass_test.rb +15 -0
- data/test/convert_test.rb +29 -0
- data/test/emacs_type_test.rb +10 -0
- data/test/insertion_test.rb +16 -0
- data/test/migemo_test.rb +50 -0
- data/test/regex_dict_test.rb +24 -0
- data/test/regex_test.rb +16 -0
- data/test/symbols_test.rb +16 -0
- data/test/test_helper.rb +22 -0
- data/test/user_dict_test.rb +22 -0
- metadata +102 -0
data/lib/migemo.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
4
|
+
#
|
5
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
6
|
+
# All rights reserved.
|
7
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
8
|
+
#
|
9
|
+
# You can redistribute it and/or modify it under the terms of
|
10
|
+
# the GNU General Public License version 2.
|
11
|
+
|
12
|
+
$LOAD_PATH << File.dirname(File.expand_path(__FILE__))
|
13
|
+
require 'migemo/core_ext/string'
|
14
|
+
require 'migemo-dict'
|
15
|
+
require 'migemo-regex'
|
16
|
+
require 'migemo/version'
|
17
|
+
require 'romkan'
|
18
|
+
include MigemoRegex
|
19
|
+
|
20
|
+
|
21
|
+
# Expands a romaji search pattern into a regular expression covering its
# kana, katakana and dictionary-word readings.
#
# The pattern is fixed at construction time; the accessors below tune how
# the expression is compiled and rendered.
class Migemo
  # optimization: level handed to RegexCompiler#optimize (skipped when nil/false)
  # type:         renderer type handed to RegexRendererFactory ("ruby" by default)
  # insertion:    string handed to the renderer (empty by default)
  # dict_cache:   optional object answering #lookup(pattern); consulted before compiling
  # user_dict:    optional extra dictionary whose words are appended to the result
  # regex_dict:   optional dictionary of ready-made regexes joined to the rendered string
  # with_paren:   forwarded to the renderer's #with_paren=
  attr_accessor :optimization, :type, :insertion, :dict_cache,
                :user_dict, :regex_dict, :with_paren

  # pattern - the search string to expand.
  # dict    - nil (use the dictionary shipped next to this file), a String
  #           path (opened with MigemoStaticDict), or a ready dictionary
  #           object, which is used as is.
  def initialize(pattern, dict = nil)
    @static_dict =
      case dict
      when nil
        MigemoStaticDict.new(File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict')
      when String
        MigemoStaticDict.new(dict)
      else
        dict
      end
    @type = "ruby"
    @pattern = pattern
    @insertion = ""
    @optimization = 3

    @dict_cache = nil
    @user_dict = nil
    @regex_dict = nil
    @with_paren = false
  end

  # Returns the regex tree for the pattern: an empty RegexAlternation for an
  # empty pattern, otherwise the cached entry (when a cache is set and has
  # one) or a freshly compiled tree, with user-dictionary entries pushed on.
  def lookup
    return RegexAlternation.new if @pattern == ""

    result = (@dict_cache && lookup_cache) || lookup0
    lookup_user_dict.each { |x| result.push(x) } if @user_dict
    result
  end

  # Alias-style accessor for the tree produced by #lookup.
  def regex_tree
    lookup
  end

  # Renders the tree to a regex string using the configured type and
  # insertion, then joins in regex-dictionary entries when one is set.
  def regex
    renderer = RegexRendererFactory.new(lookup, @type, @insertion)
    renderer.with_paren = @with_paren
    string = renderer.render
    string = renderer.join_regexes(string, lookup_regex_dict) if @regex_dict
    string
  end

  private

  # Returns the sorted kana candidates for the (downcased) pattern.
  #
  # `do'   => (ど)
  # `d'    => (っ だ ぢ づ で ど)
  # `sh'   => (しゃ し しゅ しぇ しょ)
  # `don'  => (どん どな どに どぬ どね どの どっ) # special case 1
  # `nodd' => (のっ)                               # special case 2
  # `doc'  => (どっ どち)                          # special case 3
  # `dox'  => (どっ どゃ どゅ どょ)                # special case 4
  # `essy' => (えっしゃ えっしゅ えっしょ)         # special case 5
  # `ny'   => (にゃ にゅ にょ)                     # special case 6
  def expand_kanas
    kana = @pattern.downcase.to_kana
    split = /^(.*)(.)$/.match(kana)
    return [] unless split

    head = split[1]
    last = split[2]

    cand =
      if last.consonant?
        expand_trailing_consonant(head, last)
      elsif last == "ん" # special case 1
        [kana] + ("n".expand_consonant + ["っ"]).map { |x| head + x.to_kana }
      else
        [kana]
      end
    cand.sort
  end

  # Candidates for a pattern whose converted form still ends in an
  # unconverted consonant `last`, preceded by `head`.
  def expand_trailing_consonant(head, last)
    split = /^(.*)(.)$/.match(head)
    if split && split[2].consonant?
      head2 = split[1]
      beforelast = split[2]
      if last == beforelast # special case 2
        [head2 + "っ"]
      elsif (inner = /^(.*)(.)$/.match(head2)) && beforelast == inner[2] && last.consonant?
        # special case 5
        # The prefix is captured in a local so the block does not depend on
        # the regex globals still holding this match.
        prefix = inner[1]
        (beforelast + last).expand_consonant.map { |x| prefix + "っ" + x.to_kana }
      else
        (beforelast + last).expand_consonant.map { |x| head2 + x.to_kana }
      end
    elsif (ny = /^(.*?)(n?)ny$/.match(@pattern)) && ny[2] == "" # special case 6
      # NOTE(review): the prefix comes from the raw @pattern, not from the
      # converted kana - confirm this is intended for patterns longer than "ny".
      head2 = ny[1]
      "ny".expand_consonant.map { |x| head2 + x.to_kana }
    else
      deriv = last.expand_consonant + ["xtsu"]
      case last
      when "c" # special case 3
        deriv.push "chi"
      when "x" # special case 4
        deriv.push "xya", "xyu", "xyo", "xwa"
      end
      deriv.map { |x| head + x.to_kana }
    end
  end

  # Collects the values of every item the dictionary yields for `pattern`.
  # `めし' => (飯 飯合 雌蘂 雌蕊 飯櫃 目下 飯粒 召使 飯屋)
  #
  # Raises RuntimeError when pattern is nil.
  def expand_words(dict, pattern)
    raise "pattern must not be nil" if pattern.nil?

    words = []
    dict.lookup(pattern) { |item| words.concat(item.values) }
    words
  end

  # Asks the cache for the pattern; the result may be nil/false, in which
  # case #lookup falls back to compiling.
  def lookup_cache
    @dict_cache.lookup(@pattern)
  end

  # Compiles the pattern, its full-width form, every kana candidate (plus
  # katakana) and all static-dictionary words into one regex tree.
  def lookup0
    compiler = RegexCompiler.new
    compiler.push(@pattern)
    compiler.push(@pattern.to_fullwidth)
    expand_kanas.each do |kana|
      compiler.push(kana)
      compiler.push(kana.to_katakana)
      expand_words(@static_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@static_dict, @pattern).each { |word| compiler.push(word) }
    finish_compile(compiler)
  end

  # Compiles the user-dictionary words found for the kana candidates and
  # for the raw pattern.
  def lookup_user_dict
    compiler = RegexCompiler.new
    expand_kanas.each do |kana|
      expand_words(@user_dict, kana).each { |word| compiler.push(word) }
    end
    expand_words(@user_dict, @pattern).each { |word| compiler.push(word) }
    finish_compile(compiler)
  end

  # Shared tail of the compile steps: de-duplicate, optionally optimize,
  # and return the compiled regex tree.
  def finish_compile(compiler)
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Collects the values of every regex-dictionary item for the pattern.
  def lookup_regex_dict
    regexes = []
    @regex_dict.lookup(@pattern) { |item| regexes.concat(item.values) }
    regexes
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# String helpers used by Migemo: kana conversion, escape-aware first/last/rest
# splitting, and half-width to full-width conversion.
class String
  # Hiragana to Katakana
  def to_katakana
    gsub(/う゛/, 'ヴ').tr('ぁ-ん', 'ァ-ン')
  end

  # First character, treating a backslash escape (`\x`) as one unit.
  # Returns nil for an empty string.
  def first
    self[/^(\\.|.)/, 1]
  end

  # Last character, treating a backslash escape (`\x`) as one unit.
  # Returns nil for an empty string.
  def last
    self[/(\\.|.)$/, 1]
  end

  # Everything after #first. Returns nil for an empty string.
  def rest
    self[/^(\\.|.)(.*)/, 2]
  end

  # Half-width (printable ASCII) to full-width conversion table.
  #
  # Built from code points rather than written as literals, so the table
  # cannot be damaged by tools that fold full-width characters to ASCII
  # (which turns to_fullwidth into a no-op and turns the backslash entry
  # into an unterminated string literal).
  #
  # Space maps to the ideographic space U+3000, and "!".."~" map to the
  # Fullwidth Forms block (code point + 0xFEE0). Five punctuation marks use
  # their own quotation/minus/overline characters instead, listed below.
  HANZEN_TAB = begin
    table = { " " => "\u3000" }
    (0x21..0x7E).each do |code|
      table[[code].pack("U")] = [code + 0xFEE0].pack("U")
    end
    table.update(
      '"' => "\u201D", # right double quotation mark
      "'" => "\u2019", # right single quotation mark
      "-" => "\u2212", # minus sign
      "`" => "\u2018", # left single quotation mark
      "~" => "\u203E"  # overline
    )
  end

  # Matches any single character that has an entry in HANZEN_TAB.
  HANZEN_RE = Regexp.new(HANZEN_TAB.keys.sort.map {|x|
                           Regexp.quote(x)
                         }.join('|'))

  # Returns a copy with every printable ASCII character replaced by its
  # full-width counterpart; all other characters are left untouched.
  def to_fullwidth
    gsub(HANZEN_RE) { |s| HANZEN_TAB[s] }
  end

  # Compares the leading string.length characters of self with string
  # using <=> (0 when self starts with string).
  def prefix_match(string)
    self[0, string.length] <=> string
  end
end
|
data/test/cache_test.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Checks that a Migemo instance backed by a dictionary cache renders the
# same regex as one compiled without the cache.
class CacheTest < Test::Unit::TestCase
  # Collects the distinct leading word characters of the test dictionary's
  # entries and makes sure a cache file exists for that dictionary.
  def setup
    base_dir = File.dirname(File.expand_path(__FILE__))
    dict_file = base_dir + '/../data/test-dict'
    words = base_dir + '/../data/migemo-chars'
    File.open(dict_file) do |f|
      @words = f.readlines.map { |l| l[/^(\w).*? /, 1] }.compact.uniq
    end
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    unless File.exist?(dict_file + '.cache')
      Migemo::Cache.new(dict_file, words).generate.save(dict_file + '.cache')
    end
    @cache_dict = MigemoDictCache.new(dict_file + '.cache')
  end

  def test_caches
    @words.each do |w|
      @normal = build_migemo(w)
      @cache = build_migemo(w)
      @cache.dict_cache = @cache_dict
      assert_equal @normal.regex, @cache.regex
    end
  end

  private

  # Builds a Migemo for `pattern` wired to the shared test dictionaries
  # (migemo_dict, user_dict and regex_dict are not defined in this class;
  # presumably they come from test_helper).
  def build_migemo(pattern)
    migemo = Migemo.new(pattern, migemo_dict)
    migemo.user_dict = user_dict
    migemo.regex_dict = regex_dict
    migemo
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks how characters that are special inside a character class
# (`]`, `\`, `-`) are rendered for each regex type.
class CharclassTest < Test::Unit::TestCase
  # The second `sym` alternative is the full-width form produced by
  # String#to_fullwidth; it must stay full-width or it would be
  # indistinguishable from the first alternative.
  def test_ruby_type
    migemo = Migemo.new('sym', migemo_dict)
    assert_equal "[]$%@\\\\-]|sym|sym", migemo.regex
  end

  def test_emacs_type
    migemo = Migemo.new('sym', migemo_dict)
    migemo.type = 'emacs'
    assert_equal "[]$%@\\-]\\|sym\\|sym", migemo.regex
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks the conversion of SKK-style dictionary text by Migemo::Convert.
class ConvertTest < Test::Unit::TestCase
  # The converted header plus entries, joined by newlines, must equal the
  # expected dictionary text below.
  def test_convert
    skk_source = <<-EOF
;;
;; This is a comment line.
;;
りかい /理解/
りかいs /理解/
motion /モーション/
りくとく /六徳;人が守るべき六つの徳。「ろくとく」とも/
    EOF
    expected = <<-EOF
;;
;; This is Migemo's dictionary generated from SKK's.
;;
;;
;; This is a comment line.
;;
motion モーション
りかい 理解
りくとく 六徳
    EOF
    converter = Migemo::Convert.new(skk_source)
    actual = (converter.header + converter.transfer).join("\n") + "\n"
    assert_equal expected, actual
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks rendering of the pattern "mot" in Emacs regex syntax.
class TestEmacsType < Test::Unit::TestCase
  # The second alternative is the full-width form of the pattern; it must
  # stay full-width or it would duplicate the first alternative.
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    assert_equal 'mot\|mot\|も[たちっつてと]\|モ\([タチッツテト]\|ー\(ション\|ター\)\|スラ\)', migemo.regex
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
# Checks that the insertion string is placed between the characters of
# every alternative, for both regex types.
class InsertionTest < Test::Unit::TestCase
  # The second alternative is the full-width form of the pattern; it must
  # stay full-width or it would duplicate the first alternative.
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t|m\s *o\s *t|も\s *[たちっつてと]|モ\s *(?:[タチッツテト]|ー\s *(?:シ\s *ョ\s *ン|タ\s *ー)|ス\s *ラ)', migemo.regex
  end

  def test_mot_as_emacs
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t\|m\s *o\s *t\|も\s *[たちっつてと]\|モ\s *\([タチッツテト]\|ー\s *\(シ\s *ョ\s *ン\|タ\s *ー\)\|ス\s *ラ\)', migemo.regex
  end
end
|
data/test/migemo_test.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks the rendered regex for short patterns at each optimization level.
#
# In every expectation the second spelling of the pattern is its full-width
# form (from String#to_fullwidth); it must stay full-width or it would be
# identical to the first spelling.
class MigemoTest < Test::Unit::TestCase
  def test_empty_string
    migemo = Migemo.new("", migemo_dict)
    assert_equal "", migemo.regex
  end

  def test_k
    migemo = Migemo.new("k", migemo_dict)
    assert_equal "[kkかきくけこっカキクケコッ機帰気]", migemo.regex
  end

  def test_ki_with_no_optimize
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 0
    assert_equal "ki|ki|き|キ|気|機|帰|機能|帰納|帰農|機能主義|機能的|帰納的", migemo.regex
  end

  def test_ki_with_optimize1
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 1
    assert_equal "ki|ki|き|キ|機|帰|気", migemo.regex
  end

  def test_ki_with_optimize3
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 3
    assert_equal "[きキ機帰気]|ki|ki", migemo.regex
  end

  def test_kin
    migemo = Migemo.new("kin", migemo_dict)
    migemo.optimization = 3
    assert_equal "kin|kin|き[っなにぬねのん]|キ[ッナニヌネノン]|機能|帰[納農]", migemo.regex
  end

  def test_mot_with_optimize2
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 2
    assert_equal "mot|mot|も(?:た|ち|っ|つ|て|と)|モ(?:ー(?:ション|ター)|スラ|タ|チ|ッ|ツ|テ|ト)", migemo.regex
  end

  def test_mot_with_optimize3
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 3
    assert_equal "mot|mot|も[たちっつてと]|モ(?:[タチッツテト]|ー(?:ション|ター)|スラ)", migemo.regex
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks that regex-dictionary entries (and user-dictionary words) are
# joined onto the rendered regex.
#
# In every expectation the second spelling of the pattern is its full-width
# form (from String#to_fullwidth); it must stay full-width or it would be
# identical to the first spelling.
class RegexDictTest < Test::Unit::TestCase
  def test_m
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal '[mmっまみむめもッマミムメモ]|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end

  def test_ur
    migemo = Migemo.new('ur', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal 'ur|ur|う[っらりるれろ]|ウ[ッラリルレロ]|\(\(http\|https\|ftp\|afs\|wais\|telnet\|ldap\|gopher\|news\|nntp\|rsync\|mailto\)://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+\)', migemo.regex
  end

  def test_m_with_userdict
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    migemo.user_dict = user_dict
    assert_equal '[mmっまみむめもッマミムメモ]|Message Of The Day|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end
end
|
data/test/regex_test.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Smoke test: the regex rendered for arbitrary dictionary lines must be
# accepted by Regexp.compile.
class RegexTest < Test::Unit::TestCase
  def test_compile
    dict_path = File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict'
    lines = File.open(dict_path) { |f| f.readlines.map(&:chomp) }
    # Draw up to ten distinct lines. Array#sample simply returns fewer
    # elements for a short file, where repeated slice!(rand(length)) would
    # start yielding nil once the list is exhausted.
    patterns = lines.sample(10)
    patterns.each do |pattern|
      migemo = Migemo.new(pattern, migemo_dict)
      assert Regexp.compile(migemo.regex)
    end
  end
end
|
16
|
+
|