klookup 0.3 → 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/klookup.cgi +118 -30
- data/data/klookup/corpus.txt +2718 -0
- data/data/klookup/data.db +0 -0
- data/data/klookup/edict.gz +0 -0
- data/lib/klookup.rb +43 -0
- data/lib/klookup/database_flatfile_kanjidic.rb +1 -10
- data/lib/klookup/lookup.rb +1 -0
- data/lib/klookup/lookup_kanji.rb +2 -34
- data/test/suite.rb +55 -55
- metadata +14 -20
data/data/klookup/data.db
CHANGED
Binary file
|
Binary file
|
data/lib/klookup.rb
CHANGED
@@ -21,6 +21,49 @@ require 'runicode'
|
|
21
21
|
|
22
22
|
# Contains Lookup and Database.
|
23
23
|
module KLookup
|
24
|
+
# Returns true if there is kana in the string.
|
25
|
+
def self.include_kana?(str)
|
26
|
+
return (not (str =~ /[#{0x3040.chr}-#{0x30FF.chr}]/).nil?)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns a string containing the UTF-8 encoded character represented by the
|
30
|
+
# receiver’s value.
|
31
|
+
#
|
32
|
+
# Uses RUnicode's Integer#chr method
|
33
|
+
def self.cp_to_str(val)
|
34
|
+
return val.chr
|
35
|
+
end
|
36
|
+
|
37
|
+
# Returns a regular expression that matches strings in a kana-insensitive
|
38
|
+
# manner.
|
39
|
+
def self.norm_kana(str)
|
40
|
+
# Relevant codepoints:
|
41
|
+
# ひらがな == カタカナ
|
42
|
+
# 3041 - 3096 == 30A1 - 30F6 - ァ-ヶ
|
43
|
+
# 309D - 309E == 30FD - 30FE - ヽ-ヾ
|
44
|
+
hiragana = (0x3041..0x3096).to_a + (0x309D..0x309E).to_a
|
45
|
+
katakana = (0x30A1..0x30F6).to_a + (0x30FD..0x30FE).to_a
|
46
|
+
hkhash = {}
|
47
|
+
khhash = {}
|
48
|
+
i=0
|
49
|
+
hiragana.each {|c|
|
50
|
+
hkhash[c] = katakana[i]
|
51
|
+
khhash[katakana[i]] = c
|
52
|
+
i+=1
|
53
|
+
}
|
54
|
+
re=''
|
55
|
+
str.each_char {|c|
|
56
|
+
if hiragana.include?(c.chars.first)
|
57
|
+
re << "[#{c}#{cp_to_str(hkhash[c.chars.first])}]"
|
58
|
+
elsif katakana.include?(c.chars.first)
|
59
|
+
re << "[#{c}#{cp_to_str(khhash[c.chars.first])}]"
|
60
|
+
else
|
61
|
+
re << c
|
62
|
+
end
|
63
|
+
}
|
64
|
+
Regexp.new("#{re}")
|
65
|
+
end
|
66
|
+
|
24
67
|
require 'klookup/database'
|
25
68
|
require 'klookup/lookup'
|
26
69
|
end
|
@@ -44,15 +44,6 @@ class KLookup::Database::FlatFile::KanjiDic
|
|
44
44
|
@records[kanji] = {:items=>items}
|
45
45
|
end
|
46
46
|
|
47
|
-
#Returns true if there is kana in the string.
|
48
|
-
def include_kana?(str)
|
49
|
-
kana = (0x3040..0x30FF)
|
50
|
-
str.split(//).each {|i|
|
51
|
-
return true if kana.include? i.chars[0]
|
52
|
-
}
|
53
|
-
false
|
54
|
-
end
|
55
|
-
|
56
47
|
public
|
57
48
|
|
58
49
|
# Returns true if a kanji exists in the database.
|
@@ -87,7 +78,7 @@ class KLookup::Database::FlatFile::KanjiDic
|
|
87
78
|
name_reading = []
|
88
79
|
items.each {|i|
|
89
80
|
name_flag = true if i=='T1'
|
90
|
-
if include_kana?(i)
|
81
|
+
if KLookup.include_kana?(i)
|
91
82
|
if name_flag
|
92
83
|
name_reading << i
|
93
84
|
else
|
data/lib/klookup/lookup.rb
CHANGED
data/lib/klookup/lookup_kanji.rb
CHANGED
@@ -16,42 +16,10 @@ class KLookup::Lookup::Kanji
|
|
16
16
|
@@data = KLookup::Lookup.default_handler
|
17
17
|
|
18
18
|
private
|
19
|
-
# Returns a string containing the UTF-8 encoded character represented by the
|
20
|
-
# receiver’s value.
|
21
|
-
#
|
22
|
-
# Uses RUnicode's Integer#chr method
|
23
|
-
def self.cp_to_str(val)
|
24
|
-
return val.chr
|
25
|
-
end
|
26
|
-
|
27
19
|
# Returns a regular expression that matches strings in a kana-insensitive
|
28
20
|
# manner.
|
29
|
-
def
|
30
|
-
|
31
|
-
# ひらがな == カタカナ
|
32
|
-
# 3041 - 3096 == 30A1 - 30F6 - ァ-ヶ
|
33
|
-
# 309D - 309E == 30FD - 30FE - ヽ-ヾ
|
34
|
-
hiragana = (0x3041..0x3096).to_a + (0x309D..0x309E).to_a
|
35
|
-
katakana = (0x30A1..0x30F6).to_a + (0x30FD..0x30FE).to_a
|
36
|
-
hkhash = {}
|
37
|
-
khhash = {}
|
38
|
-
i=0
|
39
|
-
hiragana.each {|c|
|
40
|
-
hkhash[c] = katakana[i]
|
41
|
-
khhash[katakana[i]] = c
|
42
|
-
i+=1
|
43
|
-
}
|
44
|
-
re=''
|
45
|
-
str.each_char {|c|
|
46
|
-
if hiragana.include?(c.chars.first)
|
47
|
-
re << "[#{c}#{cp_to_str(hkhash[c.chars.first])}]"
|
48
|
-
elsif katakana.include?(c.chars.first)
|
49
|
-
re << "[#{c}#{cp_to_str(khhash[c.chars.first])}]"
|
50
|
-
else
|
51
|
-
re << c
|
52
|
-
end
|
53
|
-
}
|
54
|
-
Regexp.new("^#{re}$")
|
21
|
+
def norm_kana(str)
|
22
|
+
Regexp.new("^#{KLookup.norm_kana(str)}$")
|
55
23
|
end
|
56
24
|
public
|
57
25
|
|
data/test/suite.rb
CHANGED
@@ -111,71 +111,71 @@ class Lookup_Test < Test::Unit::TestCase
|
|
111
111
|
assert (not KLookup::Lookup::Radical.exist?('た'))
|
112
112
|
end
|
113
113
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
114
|
+
# def test_just_meaning_lookup
|
115
|
+
# cat_m = KLookup::Lookup::Kanji.lookup(:meaning=>'cat')
|
116
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
117
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
118
|
+
# assert (cat_m.include?(cat) and not cat_m.include?(dog)), ':meaning'
|
119
|
+
# end
|
120
|
+
|
121
|
+
# def test_meaning_lookup
|
122
|
+
# cat_m = KLookup::Lookup::Kanji.lookup(:meaning=>'cat')
|
123
|
+
# cat_ms = KLookup::Lookup::Kanji.lookup(:meaning=>'cat', :stroke=>11)
|
124
|
+
# cat_mss = KLookup::Lookup::Kanji.lookup(:meaning=>'cat', :stroke=>12)
|
125
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
126
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
127
|
+
# assert (cat_m.include?(cat) and not cat_m.include?(dog)), ':meaning'
|
128
|
+
# assert (cat_ms.include?(cat) and not cat_ms.include?(dog)), ':meaning and valid :stroke'
|
129
|
+
# assert (not cat_mss.include?(cat) and not cat_mss.include?(dog)), ':meaning and invalid :stroke'
|
130
|
+
# end
|
131
|
+
|
132
|
+
# def test_reading_lookup
|
133
|
+
# cat_r = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ')
|
134
|
+
# cat_rs = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ', :stroke=>11)
|
135
|
+
# cat_rss = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ', :stroke=>12)
|
136
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
137
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
138
|
+
# assert (cat_r.include?(cat) and not cat_r.include?(dog)), ':reading'
|
139
|
+
# assert (cat_rs.include?(cat) and not cat_rs.include?(dog)), ':reading and valid :stroke'
|
140
|
+
# assert (not cat_rss.include?(cat) and not cat_rss.include?(dog)), ':reading and invalid :stroke'
|
141
|
+
# end
|
142
|
+
|
143
|
+
# def test_just_reading_lookup
|
144
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
145
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
146
|
+
# cat_r = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ')
|
147
|
+
# assert (cat_r.include?(cat) and not cat_r.include?(dog)), ':reading in same kana'
|
148
|
+
# cat_r = KLookup::Lookup::Kanji.lookup(:reading=>'ネコ')
|
149
|
+
# assert (cat_r.include?(cat) and not cat_r.include?(dog)), ':reading in different kana'
|
150
|
+
# end
|
151
151
|
|
152
152
|
def test_all_lookup
|
153
153
|
#TODO: this may not be a good test
|
154
154
|
assert KLookup::Lookup::Kanji.lookup.length > 1000
|
155
155
|
end
|
156
156
|
|
157
|
-
|
158
|
-
|
159
|
-
|
157
|
+
# def test_just_block_lookup
|
158
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
159
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
160
160
|
|
161
|
-
|
162
|
-
|
163
|
-
|
161
|
+
# look = KLookup::Lookup::Kanji.lookup {|k| k.meaning.include?('cat')}
|
162
|
+
# assert (look.include?(cat) and not look.include?(dog)), 'meaning in block'
|
163
|
+
# end
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
165
|
+
# def test_block_lookup
|
166
|
+
# cat = KLookup::Lookup::Kanji.new('猫')
|
167
|
+
# dog = KLookup::Lookup::Kanji.new('犬')
|
168
168
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
169
|
+
# look = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ') {|k|
|
170
|
+
# k.meaning.include?('cat')}
|
171
|
+
# assert (look.include?(cat) and not look.include?(dog)),
|
172
|
+
# 'meaning in block and reading'
|
173
173
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
174
|
+
# look = KLookup::Lookup::Kanji.lookup(:reading=>'ねこ') {|k|
|
175
|
+
# true}
|
176
|
+
# assert (look.include?(cat) and not look.include?(dog)),
|
177
|
+
# 'true in block and reading'
|
178
|
+
# end
|
179
179
|
|
180
180
|
def test_kanji_stroke_count
|
181
181
|
assert_equal KLookup::Lookup::Kanji.new('猫').stroke_count, 11
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: klookup
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.3"
|
7
|
-
date: 2007-
|
6
|
+
version: "0.4"
|
7
|
+
date: 2007-04-22 00:00:00 +01:00
|
8
8
|
summary: A set of kanji lookup tools and a library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -29,23 +29,25 @@ post_install_message:
|
|
29
29
|
authors:
|
30
30
|
- Tom Adams
|
31
31
|
files:
|
32
|
-
- data/klookup/data.db
|
33
32
|
- data/klookup/kanjidic
|
34
33
|
- data/klookup/newradkfile
|
35
|
-
-
|
34
|
+
- data/klookup/corpus.txt
|
35
|
+
- data/klookup/data.db
|
36
|
+
- data/klookup/edict.gz
|
37
|
+
- lib/klookup.rb
|
36
38
|
- lib/klookup
|
39
|
+
- lib/klookup/lookup.rb
|
40
|
+
- lib/klookup/lookup_radical.rb
|
37
41
|
- lib/klookup/database_sqlite.rb
|
38
|
-
- lib/klookup/lookup_kanji.rb
|
39
42
|
- lib/klookup/database.rb
|
40
|
-
- lib/klookup/lookup.rb
|
41
|
-
- lib/klookup/database_flatfile_kanjidic.rb
|
42
|
-
- lib/klookup/database_unihan.rb
|
43
43
|
- lib/klookup/database_flatfile_radk.rb
|
44
|
-
- lib/klookup/
|
44
|
+
- lib/klookup/database_unihan.rb
|
45
|
+
- lib/klookup/database_flatfile_kanjidic.rb
|
45
46
|
- lib/klookup/database_flatfile.rb
|
47
|
+
- lib/klookup/lookup_kanji.rb
|
48
|
+
- lib/runicode.rb
|
46
49
|
- lib/runicode
|
47
50
|
- lib/runicode/utf8.rb
|
48
|
-
- lib/klookup.rb
|
49
51
|
test_files:
|
50
52
|
- test/database_test.rb
|
51
53
|
- test/runicode_test.rb
|
@@ -64,13 +66,5 @@ extensions: []
|
|
64
66
|
|
65
67
|
requirements: []
|
66
68
|
|
67
|
-
dependencies:
|
68
|
-
|
69
|
-
name: sqlite-ruby
|
70
|
-
version_requirement:
|
71
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 2.2.3
|
76
|
-
version:
|
69
|
+
dependencies: []
|
70
|
+
|