klookup 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/cklookup +91 -0
- data/bin/gklookup +48 -0
- data/bin/klookup.cgi +206 -0
- data/data/klookup/kanjidic +6356 -0
- data/data/klookup/newradkfile +1068 -0
- data/lib/klookup.rb +27 -0
- data/lib/klookup/database.rb +66 -0
- data/lib/klookup/database_flatfile.rb +52 -0
- data/lib/klookup/database_flatfile_kanjidic.rb +128 -0
- data/lib/klookup/database_flatfile_radk.rb +136 -0
- data/lib/klookup/database_unihan.rb +101 -0
- data/lib/klookup/lookup.rb +29 -0
- data/lib/klookup/lookup_kanji.rb +107 -0
- data/lib/klookup/lookup_radical.rb +74 -0
- data/test/suite.rb +148 -0
- metadata +78 -0
data/lib/klookup.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
begin
|
14
|
+
require 'active_support'
|
15
|
+
rescue LoadError
|
16
|
+
require 'runicode'
|
17
|
+
end
|
18
|
+
|
19
|
+
#This allows regular expressions to work on characters.
|
20
|
+
$KCODE='u'
|
21
|
+
require 'jcode'
|
22
|
+
|
23
|
+
# Contains Lookup and Database.
|
24
|
+
module KLookup
|
25
|
+
require 'klookup/database'
|
26
|
+
require 'klookup/lookup'
|
27
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup/database.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
# Database access modules:
|
14
|
+
# - FlatFile
|
15
|
+
# - Unihan
|
16
|
+
class KLookup::Database
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# Shared body for every query method this abstract class declares: each
# public query name further down in the class is an alias of this method.
#
# Any arguments are accepted and ignored.
#
# Raises NotImplementedError unconditionally; the message names the class of
# the receiver so it is clear which backend is missing the operation.
def undefined(*args)
  raise NotImplementedError,
        "#{self.class} does not implement this database operation"
end
|
23
|
+
|
24
|
+
public
|
25
|
+
|
26
|
+
# Opens a bundled data file and hands it to the caller.
#
# The directory is chosen in this order:
# 1. <tt>$KLOOKUP_PATH/klookup</tt>, when the KLOOKUP_PATH environment
#    variable is set and non-empty;
# 2. the data directory RubyGems reports for the +klookup+ gem.
#
# +path+ is the file name relative to that directory.
#
# With a block, the file is yielded, closed when the block finishes (even if
# the block raises) and the block's value is returned. Without a block the
# open File is returned and the caller is responsible for closing it.
#
# Raises IOError when no data directory can be determined, and the usual
# Errno errors (e.g. Errno::ENOENT) when the file itself cannot be opened.
def self.open_resource(path)
  # Choose a directory
  env = ENV['KLOOKUP_PATH']
  if env and env != ''
    dir = env + '/klookup'
  else
    begin
      gem 'klookup'
      dir = Gem.datadir('klookup')
    rescue NameError, LoadError
      # NameError: RubyGems (or Gem.datadir) is not available.
      # LoadError: RubyGems is available but the gem is not installed.
      raise IOError, 'Could not find resource %s' % path
    end
    # A nil directory would interpolate to "" below and the lookup would
    # silently be made relative to the filesystem root.
    raise IOError, 'Could not find resource %s' % path if dir.nil?
  end

  # Open a file. File.open is used instead of Kernel#open so that a name
  # beginning with "|" can never be executed as a command.
  full_path = "#{dir}/#{path}"
  if block_given?
    File.open(full_path) { |file| yield file }
  else
    File.open(full_path)
  end
end
|
52
|
+
|
53
|
+
alias :stroke_count_list :undefined
|
54
|
+
alias :radicals_by_strokes :undefined
|
55
|
+
alias :get_kanji :undefined
|
56
|
+
alias :get_radical_strokes :undefined
|
57
|
+
alias :get_kanji_strokes :undefined
|
58
|
+
alias :get_radicals :undefined
|
59
|
+
alias :get_reading :undefined
|
60
|
+
alias :get_meaning :undefined
|
61
|
+
alias :is_kanji? :undefined
|
62
|
+
alias :is_radical? :undefined
|
63
|
+
|
64
|
+
require 'klookup/database_flatfile'
|
65
|
+
require 'klookup/database_unihan'
|
66
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup/database_flatfile.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
# A singleton class to abstract RadK and KanjiDic.
#
# Every query is forwarded to one of the two flat-file singletons:
# radical-related questions go to RadK, per-kanji details (strokes, readings,
# meanings) go to KanjiDic.
class KLookup::Database::FlatFile < KLookup::Database
  require 'klookup/database_flatfile_radk'
  require 'klookup/database_flatfile_kanjidic'

  require 'singleton'
  include Singleton

  # Forwards to RadK#stroke_count_list.
  def stroke_count_list(*args)
    RadK.instance.stroke_count_list(*args)
  end

  # Forwards to RadK#radicals_by_strokes.
  def radicals_by_strokes
    RadK.instance.radicals_by_strokes
  end

  # Returns the kanji that RadK#get_kanji finds for the radicals in +args+,
  # restricted to those whose KanjiDic stroke count equals +strokes+.
  # Passing +nil+ for +strokes+ disables the stroke filter.
  #
  # The result is always a new array. The previous implementation filtered
  # with delete_if directly on the array handed back by RadK, which removed
  # the rejected kanji from whatever array RadK had returned.
  def get_kanji(strokes, *args)
    candidates = RadK.instance.get_kanji(*args)
    return candidates.dup if strokes.nil?

    candidates.select { |k| get_kanji_strokes(k) == strokes }
  end

  # Forwards to RadK#get_strokes (stroke count of a radical).
  def get_radical_strokes(*args)
    RadK.instance.get_strokes(*args)
  end

  # Forwards to KanjiDic#get_strokes (stroke count of a kanji).
  def get_kanji_strokes(*args)
    KanjiDic.instance.get_strokes(*args)
  end

  # Forwards to RadK#get_radicals.
  def get_radicals(*args)
    RadK.instance.get_radicals(*args)
  end

  # Forwards to KanjiDic#get_reading.
  def get_reading(*args)
    KanjiDic.instance.get_reading(*args)
  end

  # Forwards to KanjiDic#get_meaning.
  def get_meaning(*args)
    KanjiDic.instance.get_meaning(*args)
  end

  # Forwards to KanjiDic#is_kanji?.
  def is_kanji?(*args)
    KanjiDic.instance.is_kanji?(*args)
  end

  # Forwards to RadK#is_radical?.
  def is_radical?(*args)
    RadK.instance.is_radical?(*args)
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup/database_flatfile_kanjidic.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
# Access to the KANJIDIC (additional information about kanji).
#
# The file is read once, when the singleton is created. Each non-comment,
# non-blank line is split on single spaces: the first field is the kanji and
# becomes the key of +@records+; the remaining fields are stored untouched
# under +:items+ and interpreted on demand by the query methods.
class KLookup::Database::FlatFile::KanjiDic
  require 'singleton'
  include Singleton

  # Value object returned by #get_reading. Defined once here instead of
  # building a fresh anonymous Struct class on every call.
  Reading = Struct.new(:reading, :name_reading)

  # Code points counted as kana by #include_kana? (U+3040..U+30FF, i.e. the
  # Unicode Hiragana and Katakana blocks).
  KANA_CODEPOINTS = (0x3040..0x30FF)

  private

  # Loads the dictionary from the resource named +path+.
  def initialize(path='kanjidic')
    @records = {}
    KLookup::Database.open_resource(path) { |kanjidic|
      extract_records kanjidic
    }
  end

  # Runs process_line on each line of +file+, skipping blank lines and
  # lines whose first non-space character is '#'.
  def extract_records(file)
    file.each_line { |line|
      next if line =~ /^\s*#/ or line =~ /^\s*$/ # No content, so skip
      process_line line
    }
  end

  # Sets +@records+ for one dictionary line.
  #
  # The line terminator is stripped first so that it cannot end up glued to
  # the last field (or to the kanji itself on a one-field line).
  def process_line(line)
    items = line.chomp.split(/ /)
    kanji = items.shift

    # Set the record
    @records[kanji] = {:items=>items}
  end

  # Returns true if there is kana in the string.
  #
  # The code points are decoded with unpack('U*') so the check does not
  # depend on what String#chars returns; on Ruby 1.9+ <tt>chars[0]</tt> is a
  # String, which is never a member of an Integer range.
  def include_kana?(str)
    str.unpack('U*').any? { |codepoint| KANA_CODEPOINTS.include?(codepoint) }
  end

  public

  # Returns true if a kanji exists in the database.
  #
  # Anything that is not exactly one character long (after to_s) is
  # rejected without consulting the records.
  def is_kanji?(kanji)
    return false unless kanji.respond_to?(:to_s)
    text = kanji.to_s
    return false if text.chars.length != 1
    not @records[text].nil?
  end

  # Returns the number of strokes needed to write a given kanji, taken from
  # the first field of the form <tt>S<digits></tt>.
  #
  # Raises NoMethodError when +kanji+ is unknown or has no such field.
  def get_strokes(kanji)
    field = @records[kanji][:items].find { |i| i =~ /^S\d+$/ }
    field.sub(/^S(\d+)$/, '\1').to_i
  end

  # Returns a Reading struct of arrays of Japanese readings.
  #
  # Every field containing kana is a reading. Fields seen before the 'T1'
  # marker go to +reading+, fields after it go to +name_reading+.
  #
  #  KLookup::Database::FlatFile::KanjiDic.instance.get_reading('富')
  #  #=> reading=["フ", "フウ", "と.む", "とみ"],
  #  #   name_reading=["と", "とん", "ふっ"]
  #
  # Raises NoMethodError when +kanji+ is unknown.
  def get_reading(kanji)
    result = Reading.new([], [])
    in_names = false
    @records[kanji][:items].each { |item|
      in_names = true if item == 'T1'
      next unless include_kana?(item)
      (in_names ? result.name_reading : result.reading) << item
    }
    result
  end

  # Returns an array of English meanings of kanji.
  #
  # Meanings are written as <tt>{some words}</tt>; because the line was
  # split on spaces, a multi-word meaning arrives as several fragments
  # ("{some", "words}") that are joined back together with single spaces.
  #
  # Raises NoMethodError when +kanji+ is unknown.
  def get_meaning(kanji)
    items = @records[kanji][:items]

    # Everything from the first field that opens or closes a brace onwards
    # is meaning text.
    first = nil
    items.each_with_index { |item, index|
      if item =~ /^\{|\}$/
        first = index
        break
      end
    }
    fragments = first.nil? ? [] : items[first..-1]
    # Single-word meanings lose both braces here; blank fragments are dropped.
    fragments = fragments.map { |i| i.sub(/^\{(.*)\}$/, '\1') }
    fragments = fragments.reject { |i| i =~ /^\s*$/ }

    # Stick the meanings together
    pending = false # true while a "{..." has been opened but not closed
    meaning = []
    fragments.each { |m|
      if m =~ /^\{(.*)$/ # {one
        meaning << $1
        pending = true
      elsif not pending # there are no strings being constructed
        meaning << m
      elsif m =~ /^(.*)\}$/ # three}
        meaning.last << ' ' + $1
        pending = false
      else # two (when there are strings being finished)
        meaning.last << ' ' + m
      end
    }
    meaning
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup/database_flatfile_radk.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
# Access to RADKFILE (mappings from radicals to kanji).
#
# The file is read once, when the singleton is created. A line of the form
# <tt>$ <radical> <strokes></tt> starts a record; every following line up to
# the next such line lists kanji containing that radical. Lines starting
# with '#' and empty lines are ignored.
class KLookup::Database::FlatFile::RadK
  require 'singleton'
  include Singleton

  # Sorted array of the distinct radical stroke counts found in the file.
  attr_reader :stroke_count_list
  # Hash mapping a stroke count to the array of radicals with that count.
  attr_reader :radicals_by_strokes

  # Raised when kanji data appears before any radical line. Derives from
  # StandardError (not Exception) so an ordinary +rescue+ can handle it.
  class MalformedDatabaseException < StandardError; end

  private

  # Loads the radical table from the resource named +path+.
  def initialize(path='newradkfile')
    @stroke_count_list = []
    @radicals_by_strokes = {}

    KLookup::Database.open_resource(path) { |radkfile|
      extract_records radkfile
    }
  end

  # Saves a record in the instance variable +@records+ and updates the two
  # stroke indexes.
  #
  # Raises ArgumentError unless the arguments respond to to_s, to_i and
  # to_a respectively.
  def save_record(radical,strokes,kanji)
    unless radical.respond_to?(:to_s) and
           strokes.respond_to?(:to_i) and
           kanji.respond_to?(:to_a)
      raise ArgumentError
    end
    radical = radical.to_s
    strokes = strokes.to_i
    kanji = kanji.to_a

    # @records is a hash of hashes:
    # @records = {
    #   ...
    #   '龠' => {:strokes=>17, :kanji=>['籥', '鑰', '龠']}
    # }
    @records[radical] = {:strokes=>strokes, :kanji=>kanji}

    # Keep the list sorted and free of duplicates.
    unless @stroke_count_list.include?(strokes)
      @stroke_count_list << strokes
      @stroke_count_list.sort!
    end

    (@radicals_by_strokes[strokes] ||= []) << radical
  end

  # Reads +radk+ line by line and stores one record per radical.
  #
  # Raises MalformedDatabaseException when a kanji line is found before the
  # first radical line.
  def extract_records(radk)
    @records = {}
    @stroke_count_list = []
    radical = nil # nil until the first radical line has been seen
    strokes = nil
    kanji = nil

    radk.each_line { |line|
      next if line =~ /^#/ or line =~ /^$/ # comment or empty line

      if line =~ /^\$\s+([^\s])\s+(\d+)/
        # It's a radical line: remember what it declares, flush the record
        # collected so far, then start a new one.
        new_radical = $1
        new_strokes = $2
        save_record(radical,strokes,kanji) unless radical.nil?
        radical = new_radical
        strokes = new_strokes
        kanji = []
      elsif radical.nil?
        raise MalformedDatabaseException,
              'kanji data found before the first radical line'
      else
        # It must be a line of kanji: one entry per character, without the
        # line terminator.
        kanji.concat(line.split(//))
        kanji.delete("\n")
      end
    }
    # And save the record at the end
    save_record(radical,strokes,kanji) unless radical.nil?
  end

  public

  # Tests if a radical exists.
  #
  # Anything that is not exactly one character long (after to_s) is
  # rejected without consulting the records.
  def is_radical?(radical)
    return false unless radical.respond_to?(:to_s)
    text = radical.to_s
    return false if text.chars.length != 1
    not @records[text].nil?
  end

  # Returns a list of kanji corresponding to given radicals, i.e. the kanji
  # that contain every one of them. Arrays of radicals are flattened.
  # With no radicals at all the result is an empty array.
  #
  # The result is always a new array, so callers may modify it without
  # altering the stored records.
  #
  # Raises ArgumentError when one of the radicals is unknown.
  def get_kanji(*radicals)
    kanji = nil
    radicals.flatten.each { |rad|
      record = @records[rad.to_s]
      raise ArgumentError, "unknown radical: #{rad}" if record.nil?
      if kanji.nil?
        kanji = record[:kanji].dup
      else
        kanji &= record[:kanji]
      end
    }
    kanji.to_a
  end

  # Returns the number of strokes corresponding to a radical.
  #
  # Raises NoMethodError when +radical+ is unknown.
  def get_strokes(radical)
    @records[radical][:strokes]
  end

  # Returns a list of radicals corresponding to a kanji: every radical
  # whose kanji list includes +kanji+.
  def get_radicals(kanji)
    radicals = []
    @records.each { |radical,data|
      radicals << radical if data[:kanji].include?(kanji)
    }
    radicals
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
lib/klookup/database_unihan.rb
|
4
|
+
|
5
|
+
Copyright © Tom Adams 2006
|
6
|
+
|
7
|
+
This programme is free software.
|
8
|
+
You can distribute/modify this program under
|
9
|
+
the terms of the Ruby License.
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
# Properties:
|
14
|
+
# - kJapaneseKun
|
15
|
+
# - kJapaneseOn
|
16
|
+
# - kDefinition (Chinese, with Japanese marked by '(J)')
|
17
|
+
# (contains radical number %d for radicals)
|
18
|
+
# - kFrequency (not language-specific)
|
19
|
+
# - kRSJapanese
|
20
|
+
# - kTotalStrokes
|
21
|
+
# Properties to find:
|
22
|
+
# - alternate radical representation
|
23
|
+
# - alternate kanji characters
|
24
|
+
|
25
|
+
# Other:
|
26
|
+
# - Add ability to lookup arbitrary Property for character
|
27
|
+
# - It may be faster to re-open the file every time or keep a file descriptor
|
28
|
+
# than to keep it all in memory
|
29
|
+
|
30
|
+
# A singleton class to access Unihan.txt.
#
# Only resource location and the code point helpers are implemented; the
# query methods are empty placeholders.
class KLookup::Database::Unihan < KLookup::Database
  require 'singleton'
  include Singleton

  private

  # Opens Unihan.txt and keeps the handle in +@unihan+ for later use.
  # The handle is never closed by this class.
  def initialize
    @unihan = KLookup::Database::Unihan.open_resource('Unihan.txt')
  end

  # Returns the numeric codepoint of the first codepoint in +str+
  # (nil for an empty string).
  #
  # Decoded with unpack('U') so the result is an Integer regardless of what
  # String#chars returns; on Ruby 1.9+ <tt>chars[0]</tt> is a String.
  def get_codepoint(str)
    str.unpack('U')[0]
  end

  # Returns the hexadecimal representation used by the Unicode Standard
  # (+%04X+) of the first codepoint in +str+.
  def get_hexcp(str)
    "%04X" % get_codepoint(str)
  end

  public

  # Opens a resource.
  #
  # The directory is chosen in this order:
  # 1. <tt>$KLOOKUP_PATH/unihan</tt>, when the KLOOKUP_PATH environment
  #    variable is set and non-empty;
  # 2. the data directory RubyGems reports for the +unihan+ gem.
  #
  # With a block, the file is yielded, closed when the block finishes and
  # the block's value is returned. Without a block the open File is
  # returned and the caller is responsible for closing it.
  #
  # Raises IOError when no data directory can be determined, and the usual
  # Errno errors when the file itself cannot be opened.
  def self.open_resource(path)
    # Choose a directory
    env = ENV['KLOOKUP_PATH']
    if env and env != ''
      dir = env + '/unihan'
    else
      begin
        gem 'unihan'
        dir = Gem.datadir('unihan')
      rescue NameError, LoadError
        # NameError: RubyGems (or Gem.datadir) is not available.
        # LoadError: RubyGems is available but the gem is not installed.
        raise IOError, 'Could not find resource %s' % path
      end
      # A nil directory would interpolate to "" below and the lookup would
      # silently be made relative to the filesystem root.
      raise IOError, 'Could not find resource %s' % path if dir.nil?
    end

    # Open a file. File.open is used instead of Kernel#open so that a name
    # beginning with "|" can never be executed as a command.
    full_path = "#{dir}/#{path}"
    if block_given?
      File.open(full_path) { |file| yield file }
    else
      File.open(full_path)
    end
  end

  # The query interface below is not implemented yet: every method accepts
  # its arguments and returns nil.
  # NOTE(review): these empty overrides hide the NotImplementedError that
  # the base class would otherwise raise - confirm that is intended.

  def stroke_count_list(*args)
  end

  def radicals_by_strokes
  end

  def get_kanji(strokes, *args)
  end

  def get_radical_strokes(*args)
  end

  def get_kanji_strokes(*args)
  end

  def get_radicals(*args)
  end

  def get_reading(*args)
  end

  def get_meaning(*args)
  end

  def is_kanji?(*args)
  end

  def is_radical?(*args)
  end
end
|