migemo-lib 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -0
- data/data/migemo-dict +166746 -0
- data/data/migemo-dict.cache +0 -0
- data/data/migemo-dict.cache.idx +0 -0
- data/data/migemo-dict.idx +0 -0
- data/doc/migemo.ja.rd +66 -0
- data/lib/migemo-dict.rb +126 -0
- data/lib/migemo-regex.rb +340 -0
- data/lib/migemo.rb +183 -0
- data/lib/migemo/core_ext/string.rb +60 -0
- data/lib/migemo/version.rb +8 -0
- data/test/cache_test.rb +28 -0
- data/test/charclass_test.rb +15 -0
- data/test/convert_test.rb +29 -0
- data/test/emacs_type_test.rb +10 -0
- data/test/insertion_test.rb +16 -0
- data/test/migemo_test.rb +50 -0
- data/test/regex_dict_test.rb +24 -0
- data/test/regex_test.rb +16 -0
- data/test/symbols_test.rb +16 -0
- data/test/test_helper.rb +22 -0
- data/test/user_dict_test.rb +22 -0
- metadata +102 -0
data/lib/migemo.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
4
|
+
#
|
5
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
6
|
+
# All rights reserved.
|
7
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
8
|
+
#
|
9
|
+
# You can redistribute it and/or modify it under the terms of
|
10
|
+
# the GNU General Public License version 2.
|
11
|
+
|
12
|
+
$LOAD_PATH << File.dirname(File.expand_path(__FILE__))
|
13
|
+
require 'migemo/core_ext/string'
|
14
|
+
require 'migemo-dict'
|
15
|
+
require 'migemo-regex'
|
16
|
+
require 'migemo/version'
|
17
|
+
require 'romkan'
|
18
|
+
include MigemoRegex
|
19
|
+
|
20
|
+
|
21
|
+
# Turns a search pattern into a regular expression for Japanese incremental
# search: the pattern is expanded into kana candidates, matching dictionary
# words are collected, and the resulting tree is rendered as a regex string.
class Migemo
  # pattern - the search string to expand.
  # dict    - nil to load ../data/migemo-dict relative to this file, a String
  #           path that is handed to MigemoStaticDict.new, or any other
  #           object, which is used as the static dictionary as-is.
  def initialize(pattern, dict = nil)
    @static_dict = if dict.nil?
                     MigemoStaticDict.new(File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict')
                   elsif dict.is_a?(String)
                     MigemoStaticDict.new(dict)
                   else
                     dict
                   end
    @type = "ruby"
    @pattern = pattern
    @insertion = ""
    @optimization = 3

    @dict_cache = nil
    @user_dict = nil
    @regex_dict = nil
    @with_paren = false
  end

  # optimization - level passed to RegexCompiler#optimize; nil/false skips it.
  # type         - renderer flavour handed to RegexRendererFactory ("ruby" by default).
  # insertion    - string handed to the renderer ("" by default).
  # dict_cache   - optional cache consulted before the static dictionary.
  # user_dict    - optional dictionary whose words are appended to the result.
  # regex_dict   - optional dictionary whose regexes are joined to the output.
  # with_paren   - flag forwarded to the renderer.
  attr_accessor :optimization, :type, :insertion, :dict_cache,
                :user_dict, :regex_dict, :with_paren

  # Returns the regex tree for the pattern. An empty pattern yields an empty
  # RegexAlternation. Otherwise the cache (when set) is tried first, falling
  # back to a full lookup, and user-dictionary entries are pushed on the end.
  def lookup
    return RegexAlternation.new if @pattern == ""

    result = (@dict_cache && lookup_cache) || lookup0
    lookup_user_dict.each { |x| result.push(x) } if @user_dict
    result
  end

  # Alias-style accessor for the regex tree produced by #lookup.
  def regex_tree
    lookup
  end

  # Renders the regex tree as a string for the configured type, then joins the
  # regex-dictionary entries when a regex dictionary is set.
  def regex
    renderer = RegexRendererFactory.new(lookup, @type, @insertion)
    renderer.with_paren = @with_paren
    string = renderer.render
    string = renderer.join_regexes(string, lookup_regex_dict) if @regex_dict
    string
  end

  private

  # Expands the pattern into a sorted list of kana candidates.
  #
  # `do'   => (ど)
  # `d'    => (っ だ ぢ づ で ど)
  # `sh'   => (しゃ し しゅ しぇ しょ)
  # `don'  => (どん どな どに どぬ どね どの どっ) # special case 1
  # `nodd' => (のっ)                               # special case 2
  # `doc'  => (どっ どち)                          # special case 3
  # `dox'  => (どっ どゃ どゅ どょ)                # special case 4
  # `essy' => (えっしゃ えっしゅ えっしょ)         # special case 5
  # `ny'   => (にゃ にゅ にょ)                     # special case 6
  def expand_kanas
    romaji = @pattern.downcase
    kana = romaji.to_kana
    /^(.*)(.)$/ =~ kana
    head = $1
    last = $2
    return [] if last.nil?

    cand = []
    if last.consonant?
      if /^(.*)(.)$/ =~ head && $2.consonant?
        head2 = $1
        beforelast = $2
        if last == beforelast # special case 2
          cand.push(head2 + "っ")
        elsif /^(.*)(.)$/ =~ head2 && beforelast == $2
          # special case 5
          # Capture the prefix now so the block does not depend on $1.
          prefix = $1
          cand.concat((beforelast + last).expand_consonant.map { |x| prefix + "っ" + x.to_kana })
        else
          cand.concat((beforelast + last).expand_consonant.map { |x| head2 + x.to_kana })
        end
      elsif /^(.*?)(n?)ny$/ =~ romaji && $2 == "" # special case 6
        # The head is matched on the romaji pattern, so it has to be converted
        # to kana before it is combined with the kana expansions of "ny".
        # Matching the downcased pattern keeps this branch consistent with the
        # kana conversion above.
        head2 = $1.to_kana
        cand.concat("ny".expand_consonant.map { |x| head2 + x.to_kana })
      else
        deriv = last.expand_consonant
        deriv.push "xtsu"
        if last == "c" # special case 3
          deriv.push "chi"
        elsif last == "x" # special case 4
          deriv.push "xya", "xyu", "xyo", "xwa"
        end
        cand.concat(deriv.map { |x| head + x.to_kana })
      end
    elsif last == "ん" # special case 1
      cand.push kana
      cand.concat(("n".expand_consonant + ["っ"]).map { |x| head + x.to_kana })
    else
      cand.push kana
    end
    cand.sort
  end

  # Collects the values of every item the dictionary yields for the pattern.
  #
  # `めし' => (飯 飯合 雌蘂 雌蕊 飯櫃 目下 飯粒 召使 飯屋)
  #
  # Raises RuntimeError when pattern is nil.
  def expand_words(dict, pattern)
    raise "pattern must not be nil" if pattern.nil?

    words = []
    # concat appends in place instead of building a new array per item.
    dict.lookup(pattern) { |item| words.concat(item.values) }
    words
  end

  # Asks the dictionary cache for the pattern; the caller treats a falsy
  # result as a miss.
  def lookup_cache
    @dict_cache.lookup(@pattern)
  end

  # Builds the regex tree from the static dictionary: the raw pattern, its
  # full-width form, every kana candidate with its katakana form and
  # dictionary words, and finally the dictionary words for the raw pattern.
  def lookup0
    compiler = RegexCompiler.new
    compiler.push(@pattern)
    compiler.push(@pattern.to_fullwidth)
    expand_kanas.each do |x|
      compiler.push(x)
      compiler.push(x.to_katakana)
      expand_words(@static_dict, x).each { |word| compiler.push(word) }
    end
    expand_words(@static_dict, @pattern).each { |word| compiler.push(word) }
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Builds a regex tree from the user dictionary only: words for every kana
  # candidate, then words for the raw pattern.
  def lookup_user_dict
    compiler = RegexCompiler.new
    expand_kanas.each do |x|
      expand_words(@user_dict, x).each { |word| compiler.push(word) }
    end
    expand_words(@user_dict, @pattern).each { |word| compiler.push(word) }
    compiler.uniq
    compiler.optimize(@optimization) if @optimization
    compiler.regex
  end

  # Collects the values of every regex-dictionary item yielded for the pattern.
  def lookup_regex_dict
    regexes = []
    @regex_dict.lookup(@pattern) { |item| regexes.concat(item.values) }
    regexes
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# String helpers used by Migemo: kana conversion, escape-aware first/last/rest
# accessors, half-width to full-width conversion and prefix comparison.
class String
  # Hiragana to Katakana
  def to_katakana
    gsub(/う゛/, 'ヴ').tr('ぁ-ん', 'ァ-ン')
  end

  # Returns the leading unit of the string: a backslash plus the character it
  # escapes, or else a single character. nil when nothing matches.
  def first
    self[/^(\\.|.)/, 1]
  end

  # Returns the trailing unit of the string (an escaped pair or a single
  # character). nil when nothing matches.
  def last
    self[/(\\.|.)$/, 1]
  end

  # Returns what follows the leading unit on the first line. nil when nothing
  # matches.
  def rest
    self[/^(\\.|.)(.*)/, 2]
  end

  # Maps each printable ASCII character (and space) to its full-width
  # counterpart. Most targets sit 0xFEE0 above the ASCII code point; space and
  # a few punctuation marks use dedicated characters instead. Code points are
  # spelled out so the table survives tools that fold full-width forms back to
  # ASCII.
  HANZEN_TAB = (0x21..0x7E).each_with_object({ " " => "\u3000" }) { |code, tab|
    tab[code.chr(Encoding::UTF_8)] = (code + 0xFEE0).chr(Encoding::UTF_8)
  }.merge({
    '"' => "\u201D", # right double quotation mark
    "'" => "\u2019", # right single quotation mark
    "-" => "\u2212", # minus sign
    "`" => "\u2018", # left single quotation mark
    "~" => "\u203E"  # overline
  }).freeze

  # Alternation of every (quoted) key in HANZEN_TAB.
  HANZEN_RE = Regexp.new(HANZEN_TAB.keys.sort.map { |key| Regexp.quote(key) }.join('|'))

  # Replaces every character listed in HANZEN_TAB with its full-width form;
  # other characters are left untouched.
  def to_fullwidth
    gsub(HANZEN_RE) { |half| HANZEN_TAB[half] }
  end

  # Compares the first string.length characters of self with string using
  # <=>, so 0 means self starts with string.
  def prefix_match(string)
    self[0, string.length] <=> string
  end
end
|
data/test/cache_test.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Checks that a lookup served from the generated dictionary cache renders the
# same regex as a lookup that does not use the cache.
class CacheTest < Test::Unit::TestCase
  # Reads the leading word character of each test-dict line as the set of
  # patterns to try, generates the cache file on first use, and opens it.
  def setup
    base_dir = File.dirname(File.expand_path(__FILE__))
    dict_file = base_dir + '/../data/test-dict'
    words = base_dir + '/../data/migemo-chars'
    cache_file = dict_file + '.cache'
    File.open(dict_file) do |f|
      @words = f.readlines.map { |l| l[/^(\w).*? /, 1] }.compact.uniq
    end
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
    unless File.exist?(cache_file)
      Migemo::Cache.new(dict_file, words).generate.save(cache_file)
    end
    @cache_dict = MigemoDictCache.new(cache_file)
  end

  # For every pattern, an instance with dict_cache set must render exactly
  # what an otherwise identical instance without it renders.
  def test_caches
    @words.each do |w|
      normal = build_migemo(w)
      cached = build_migemo(w)
      cached.dict_cache = @cache_dict
      assert_equal normal.regex, cached.regex
    end
  end

  private

  # Builds a Migemo for the pattern wired to the helper-provided user and
  # regex dictionaries.
  def build_migemo(pattern)
    migemo = Migemo.new(pattern, migemo_dict)
    migemo.user_dict = user_dict
    migemo.regex_dict = regex_dict
    migemo
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks how symbol entries are rendered inside a character class for both
# output types.
class CharclassTest < Test::Unit::TestCase
  # The second "sym" alternative is the full-width form of the pattern: the
  # lookup pushes the pattern and its full-width conversion and de-duplicates,
  # so two identical ASCII alternatives could never appear.
  def test_ruby_type
    migemo = Migemo.new('sym', migemo_dict)
    assert_equal "[]$%@\\\\-]|sym|ｓｙｍ", migemo.regex
  end

  # Same lookup rendered with the emacs type, where the alternation bar is
  # backslash-escaped.
  def test_emacs_type
    migemo = Migemo.new('sym', migemo_dict)
    migemo.type = 'emacs'
    assert_equal "[]$%@\\-]\\|sym\\|ｓｙｍ", migemo.regex
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Exercises Migemo::Convert on a small SKK-style dictionary snippet.
class ConvertTest < Test::Unit::TestCase
  # The converter's header followed by its transferred entries, joined with
  # newlines, must equal the expected Migemo dictionary text.
  def test_convert
    skk_source = <<-EOF
;;
;; This is a comment line.
;;
りかい /理解/
りかいs /理解/
motion /モーション/
りくとく /六徳;人が守るべき六つの徳。「ろくとく」とも/
EOF
    migemo_expected = <<-EOF
;;
;; This is Migemo's dictionary generated from SKK's.
;;
;;
;; This is a comment line.
;;
motion モーション
りかい 理解
りくとく 六徳
EOF
    converter = Migemo::Convert.new(skk_source)
    generated_lines = converter.header + converter.transfer
    assert_equal migemo_expected, generated_lines.join("\n") + "\n"
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks the emacs-type rendering of a multi-branch lookup.
class TestEmacsType < Test::Unit::TestCase
  # The second alternative is the full-width form of the pattern; the lookup
  # pushes both forms and de-duplicates, so it cannot be a second ASCII "mot".
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    assert_equal 'mot\|ｍｏｔ\|も[たちっつてと]\|モ\([タチッツテト]\|ー\(ション\|ター\)\|スラ\)', migemo.regex
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
# Checks that the insertion string is placed between the characters of every
# alternative, for both output types.
class InsertionTest < Test::Unit::TestCase
  # The second alternative is the full-width form of the pattern (the lookup
  # pushes both forms and de-duplicates), with the insertion between its
  # characters as well.
  def test_mot
    migemo = Migemo.new('mot', migemo_dict)
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t|ｍ\s *ｏ\s *ｔ|も\s *[たちっつてと]|モ\s *(?:[タチッツテト]|ー\s *(?:シ\s *ョ\s *ン|タ\s *ー)|ス\s *ラ)', migemo.regex
  end

  # Same lookup rendered with the emacs type.
  def test_mot_as_emacs
    migemo = Migemo.new('mot', migemo_dict)
    migemo.type = 'emacs'
    migemo.insertion = "\\s *"
    assert_equal 'm\s *o\s *t\|ｍ\s *ｏ\s *ｔ\|も\s *[たちっつてと]\|モ\s *\([タチッツテト]\|ー\s *\(シ\s *ョ\s *ン\|タ\s *ー\)\|ス\s *ラ\)', migemo.regex
  end
end
|
data/test/migemo_test.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks the rendered regex for a few patterns at different optimization
# levels.
#
# In every expectation the second occurrence of the pattern is its full-width
# form: the lookup pushes the pattern and its full-width conversion and then
# de-duplicates, so two identical ASCII alternatives could never appear.
class MigemoTest < Test::Unit::TestCase
  # An empty pattern renders as an empty string.
  def test_empty_string
    migemo = Migemo.new("", migemo_dict)
    assert_equal "", migemo.regex
  end

  def test_k
    migemo = Migemo.new("k", migemo_dict)
    assert_equal "[kｋかきくけこっカキクケコッ機帰気]", migemo.regex
  end

  def test_ki_with_no_optimize
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 0
    assert_equal "ki|ｋｉ|き|キ|気|機|帰|機能|帰納|帰農|機能主義|機能的|帰納的", migemo.regex
  end

  def test_ki_with_optimize1
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 1
    assert_equal "ki|ｋｉ|き|キ|機|帰|気", migemo.regex
  end

  def test_ki_with_optimize3
    migemo = Migemo.new("ki", migemo_dict)
    migemo.optimization = 3
    assert_equal "[きキ機帰気]|ki|ｋｉ", migemo.regex
  end

  def test_kin
    migemo = Migemo.new("kin", migemo_dict)
    migemo.optimization = 3
    assert_equal "kin|ｋｉｎ|き[っなにぬねのん]|キ[ッナニヌネノン]|機能|帰[納農]", migemo.regex
  end

  def test_mot_with_optimze2
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 2
    assert_equal "mot|ｍｏｔ|も(?:た|ち|っ|つ|て|と)|モ(?:ー(?:ション|ター)|スラ|タ|チ|ッ|ツ|テ|ト)", migemo.regex
  end

  def test_mot_with_optimze3
    migemo = Migemo.new("mot", migemo_dict)
    migemo.optimization = 3
    assert_equal "mot|ｍｏｔ|も[たちっつてと]|モ(?:[タチッツテト]|ー(?:ション|ター)|スラ)", migemo.regex
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'test_helper'
|
3
|
+
|
4
|
+
# Checks that regex-dictionary entries (and user-dictionary words) are joined
# onto the rendered regex.
#
# In every expectation the second occurrence of the pattern is its full-width
# form: the lookup pushes the pattern and its full-width conversion and then
# de-duplicates, so two identical ASCII alternatives could never appear.
class RegexDictTest < Test::Unit::TestCase
  def test_m
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal '[mｍっまみむめもッマミムメモ]|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end

  def test_ur
    migemo = Migemo.new('ur', migemo_dict)
    migemo.regex_dict = regex_dict
    assert_equal 'ur|ｕｒ|う[っらりるれろ]|ウ[ッラリルレロ]|\(\(http\|https\|ftp\|afs\|wais\|telnet\|ldap\|gopher\|news\|nntp\|rsync\|mailto\)://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+\)', migemo.regex
  end

  # With a user dictionary set, its word appears between the static result and
  # the regex-dictionary entry.
  def test_m_with_userdict
    migemo = Migemo.new('m', migemo_dict)
    migemo.regex_dict = regex_dict
    migemo.user_dict = user_dict
    assert_equal '[mｍっまみむめもッマミムメモ]|Message Of The Day|\([-0-9a-zA-Z_.]+@[-0-9a-zA-Z_.]+\)', migemo.regex
  end
end
|
data/test/regex_test.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Smoke test: regexes generated for dictionary lines must compile.
class RegexTest < Test::Unit::TestCase
  # Draws ten random lines (without replacement) from the bundled dictionary
  # file, uses each whole line as a pattern, and asserts that the rendered
  # regex is accepted by Regexp.compile.
  def test_compile
    dict_path = File.dirname(File.expand_path(__FILE__)) + '/../data/migemo-dict'
    samples = []
    File.open(dict_path) do |io|
      remaining = io.readlines.map(&:chomp)
      10.times do
        samples << remaining.delete_at(rand(remaining.length))
      end
    end
    samples.each do |sample|
      rendered = Migemo.new(sample, migemo_dict).regex
      assert Regexp.compile(rendered)
    end
  end
end
|
16
|
+
|