japanese_names 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE +24 -0
- data/README.md +52 -0
- data/bin/enamdict.min +674110 -0
- data/lib/japanese_names/enamdict.rb +104 -0
- data/lib/japanese_names/parser.rb +76 -0
- data/lib/japanese_names/version.rb +6 -0
- data/lib/japanese_names.rb +7 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/unit/enamdict_spec.rb +59 -0
- data/spec/unit/parser_spec.rb +42 -0
- metadata +113 -0
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
module JapaneseNames

  # Query interface for the ENAMDICT file (http://www.csse.monash.edu.au/~jwb/enamdict_doc.html)
  #
  # Each dictionary line is expected to have the form "kanji|kana|flags".
  module Enamdict

    # s - surname (138,500)
    # p - place-name (99,500)
    # u - person name, either given or surname, as-yet unclassified (139,000)
    # g - given name, as-yet not classified by sex (64,600)
    # f - female given name (106,300)
    # m - male given name (14,500)
    NAME_FAM = %w(s p u).freeze
    NAME_GIV = %w(u g f m).freeze
    NAME_ANY = (NAME_FAM | NAME_GIV).freeze

    class << self

      # Public: Matches kanji and/or kana regex strings in the dictionary.
      #
      # opts - The Hash options used to match the dictionary (default: {}).
      #        The Hash is only read, never modified:
      #        kanji: Regex String, Regexp, or Array of those, matching the
      #               whole kanji name (optional)
      #        kana:  Regex String, Regexp, or Array of those, matching the
      #               whole kana name (optional)
      #        flags: Flag or Array of flags to filter the match (optional)
      #
      # Returns the dict entries as an Array of Arrays [[kanji, kana, flags], ...]
      # Returns an empty Array when neither :kanji nor :kana is given.
      def match(opts={})
        kanji_opt = opts[:kanji]
        kana_opt  = opts[:kana]
        return [] unless kanji_opt || kana_opt

        kanji = name_regex(kanji_opt)
        kana  = name_regex(kana_opt)
        flags = flags_regex(opts[:flags])
        # \A and \z anchor the whole dictionary line.
        regex = /\A#{kanji}\|#{kana}\|#{flags}\z/

        search { |line| regex =~ line }
      end

      # Public: Selects entries in the enamdict based on a block which should
      # evaluate true or false (typically a regex).
      #
      # Yields each raw dictionary line ("kanji|kana|flags") as a String.
      #
      # Returns the dict entries as an Array of Arrays [[kanji, kana, flags], ...]
      # Raises ArgumentError if no block is given.
      def search(&block)
        raise ArgumentError, 'no block given' unless block

        selected = []
        each_line do |line|
          selected << unpack_line(line) if block.call(line)
        end
        selected
      end

      protected

      # Internal: Returns the filepath to the enamdict.min file.
      def filepath
        File.join(File.dirname(__FILE__), '../../bin/enamdict.min')
      end

      # Internal: The memoized dictionary instance, a frozen Array with one
      # String per dictionary line (line terminators removed).
      def dict
        return @dict if @dict

        lines = File.open(filepath, 'r:utf-8') do |f|
          # chomp (rather than dropping the last character unconditionally)
          # keeps a final line without trailing newline intact and also
          # removes "\r\n" terminators.
          f.each_line.map { |line| line.chomp }
        end
        @dict = lines.freeze
      end

      # Internal: Calls the given block for each line in the dict.
      def each_line(&block)
        dict.each { |line| block.call(line) }
      end

      # Internal: Formats a line as a 3-tuple Array [kanji, kana, flags]
      def unpack_line(line)
        line.split('|')
      end

      # Internal: Builds regex criteria for name.
      #
      # name - A regex String, a Regexp, an Array of those, or nil.
      #
      # Returns a regex source String. Patterns are wrapped in a non-capturing
      # group so that an alternation inside them cannot escape the surrounding
      # line pattern. Any other value (e.g. nil) matches any name.
      def name_regex(name)
        case name
        when Array          then "(?:#{name.join('|')})"
        when String, Regexp then "(?:#{name})"
        else '.+?'
        end
      end

      # Internal: Builds regex criteria for flags.
      #
      # flags - A flag String (used as a regex fragment), an Array of
      #         single-character flags, or nil.
      #
      # Returns a regex source String. nil, an empty Array, or an Array
      # covering every flag in NAME_ANY (in any order) matches any flags.
      def flags_regex(flags)
        return '.+?' unless flags
        return "(?:#{flags})" unless flags.is_a?(Array)

        wanted = flags.map { |flag| flag.to_s }
        return '.+?' if wanted.empty? || (NAME_ANY - wanted).empty?

        # Escape so that characters such as "]", "^" or "-" cannot break the
        # character class.
        ".*?[#{wanted.map { |flag| Regexp.escape(flag) }.join}].*?"
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
module JapaneseNames

  # Provides methods for parsing Japanese name strings.
  class Parser

    # Given a kanji and kana representation of a name, splits it into family/given names.
    #
    # The choice to prioritize family name is arbitrary. Further analysis is needed
    # for whether given or family name should be prioritized.
    #
    # kanji - The String full name in kanji, without separator.
    # kana  - The String full name in hiragana and/or katakana, without separator.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match.
    def split(kanji, kana)
      split_fam(kanji, kana) || split_giv(kanji, kana)
    end

    # Splits the name by looking up the given name (right-hand side) in the
    # dictionary. Longer kanji candidates are tried first.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match or the input is nil/empty.
    def split_giv(kanji, kana)
      entries = lookup(kanji, kana) { window_right(kanji) }
      entries.each do |entry|
        # \z anchors at the very end of the kana string.
        kana_match = kana[/#{hk entry[1]}\z/]
        next unless kana_match
        return [[mask_right(kanji, entry[0]), entry[0]], [mask_right(kana, kana_match), kana_match]]
      end
      nil
    end

    # Splits the name by looking up the family name (left-hand side) in the
    # dictionary. Longer kanji candidates are tried first.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match or the input is nil/empty.
    def split_fam(kanji, kana)
      entries = lookup(kanji, kana) { window_left(kanji) }
      entries.each do |entry|
        # \A anchors at the very start of the kana string.
        kana_match = kana[/\A#{hk entry[1]}/]
        next unless kana_match
        return [[entry[0], mask_left(kanji, entry[0])], [kana_match, mask_left(kana, kana_match)]]
      end
      nil
    end

    # TODO: add option to strip honorific '様'
    # TODO: add option to infer sex (0 = unknown, 1 = male, 2 = female as per ISO/IEC 5218)

    protected

    # Queries the dictionary for the kanji candidates produced by the block.
    #
    # Returns the matching entries sorted by kanji length, longest first.
    # Returns an empty Array, without querying the dictionary, when either
    # input is nil/empty or there are no candidates.
    def lookup(kanji, kana)
      return [] if blank?(kanji) || blank?(kana)

      candidates = yield
      return [] if candidates.empty?

      # Enamdict.match treats its criteria as regex fragments, so literal
      # name text must be escaped first.
      patterns = candidates.map { |candidate| Regexp.escape(candidate) }
      Enamdict.match(kanji: patterns).sort { |x, y| y[0].size <=> x[0].size }
    end

    # Returns true if the value is nil or empty.
    def blank?(value)
      value.nil? || value.empty?
    end

    # Returns a regex string which matches both hiragana and katakana variations of a String.
    def hk(str)
      "(?:#{Regexp.escape(Moji.kata_to_hira(str))}|#{Regexp.escape(Moji.hira_to_kata(str))})"
    end

    # Masks a String from the left side and returns the remaining (right) portion of the String.
    # The mask is compared literally; if str does not start with it, a copy of str is returned.
    #
    # Example: mask_left("abcde", "ab") #=> "cde"
    def mask_left(str, mask)
      mask = mask.to_s
      str.start_with?(mask) ? str[mask.size..-1] : str.dup
    end

    # Masks a String from the right side and returns the remaining (left) portion of the String.
    # The mask is compared literally; if str does not end with it, a copy of str is returned.
    #
    # Example: mask_right("abcde", "de") #=> "abc"
    def mask_right(str, mask)
      mask = mask.to_s
      str.end_with?(mask) ? str[0...(str.size - mask.size)] : str.dup
    end

    # Given a String, returns an array of progressively smaller substrings anchored on the left side.
    #
    # Example: window_left("abcde") #=> ["abcd", "abc", "ab", "a"]
    def window_left(str)
      (1...str.size).map { |length| str[0, length] }.reverse
    end

    # Given a String, returns an array of progressively smaller substrings anchored on the right side.
    #
    # Example: window_right("abcde") #=> ["bcde", "cde", "de", "e"]
    def window_right(str)
      (1...str.size).map { |start| str[start..-1] }
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
# Specs for the ENAMDICT query interface. The expectations below rely on the
# entries of the bundled dictionary file.
describe JapaneseNames::Enamdict do

  subject { JapaneseNames::Enamdict }

  describe '.search' do

    it 'should select only lines which match criteria' do
      result = subject.search{|line| line =~ /^.+?\|あわのはら\|.+?$/}
      expect(result).to eq [["粟野原", "あわのはら", "s"]]
    end

    it 'should select multiple lines' do
      result = subject.search{|line| line =~ /^.+?\|はしの\|.+?$/}
      expect(result).to eq [["橋之", "はしの", "p"],
                            ["橋埜", "はしの", "s"],
                            ["橋野", "はしの", "s"],
                            ["端野", "はしの", "s"],
                            ["箸野", "はしの", "s"]]
    end
  end

  # This group exercises Enamdict.match (it was previously labelled '#lookup').
  describe '.match' do

    it 'should match kanji only' do
      result = subject.match(kanji: '外世子')
      expect(result).to eq [["外世子", "とよこ", "f"]]
    end

    it 'should match kana only' do
      result = subject.match(kana: 'ならしま')
      expect(result).to eq [["樽島", "ならしま", "u"],
                            ["奈良島", "ならしま", "s"],
                            ["楢島", "ならしま", "s"],
                            ["楢嶋", "ならしま", "s"]]
    end

    it 'should match both kanji and kana only' do
      result = subject.match(kanji: '楢二郎', kana: 'ならじろう')
      expect(result).to eq [["楢二郎", "ならじろう", "m"]]
    end

    it 'should match flags as String' do
      result = subject.match(kana: 'ならしま', flags: 's')
      expect(result).to eq [["奈良島", "ならしま", "s"],
                            ["楢島", "ならしま", "s"],
                            ["楢嶋", "ならしま", "s"]]
    end

    it 'should match flags as Array' do
      result = subject.match(kana: 'ならしま', flags: ['u','g'])
      expect(result).to eq [["樽島", "ならしま", "u"]]
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
# Specs for the name parser. Each row below is
# [kanji family, kanji given, kana family, kana given]; the parser is fed the
# concatenated full name and must recover the individual parts.
describe JapaneseNames::Parser do

  subject { JapaneseNames::Parser.new }

  describe '#split' do

    [['上原','望','ウエハラ', 'ノゾミ'],
     ['樋口','知美','ヒグチ', 'ともみ'],
     ['堺','雅美','さかい', 'マサミ'],
     ['中村','幸子','ナカムラ', 'サチコ'],
     ['秋保','郁子','アキホ', 'いくこ'],
     ['光野','亜佐子','ミツノ', 'アサコ'],
     ['熊澤','貴子','クマザワ', 'タカコ']].each do |kanji_fam, kanji_giv, kana_fam, kana_giv|
      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv}" do
        result = subject.split(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end

      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv} by given name" do
        result = subject.split_giv(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end

      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv} by family name" do
        result = subject.split_fam(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end
    end

    # Input that is expected to have no dictionary match must yield nil.
    [['XXX','XXX','XXX', 'XXX']].each do |kanji_fam, kanji_giv, kana_fam, kana_giv|
      it "should return nil for invalid name #{kanji_fam+kanji_giv} #{kana_fam+kana_giv}" do
        result = subject.split(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to be_nil
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: japanese_names
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Johnny Shields
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-09-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: moji
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 3.0.0
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 3.0.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: gem-release
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: Japanese name parser based on ENAMDICT
|
70
|
+
email: johnny.shields@gmail.com
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- LICENSE
|
76
|
+
- README.md
|
77
|
+
- bin/enamdict.min
|
78
|
+
- lib/japanese_names.rb
|
79
|
+
- lib/japanese_names/enamdict.rb
|
80
|
+
- lib/japanese_names/parser.rb
|
81
|
+
- lib/japanese_names/version.rb
|
82
|
+
- spec/spec_helper.rb
|
83
|
+
- spec/unit/enamdict_spec.rb
|
84
|
+
- spec/unit/parser_spec.rb
|
85
|
+
homepage: https://github.com/johnnyshields/japanese_names
|
86
|
+
licenses:
|
87
|
+
- MIT
|
88
|
+
metadata: {}
|
89
|
+
post_install_message:
|
90
|
+
rdoc_options: []
|
91
|
+
require_paths:
|
92
|
+
- lib
|
93
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project:
|
105
|
+
rubygems_version: 2.2.1
|
106
|
+
signing_key:
|
107
|
+
specification_version: 4
|
108
|
+
summary: Tools for parsing japanese names
|
109
|
+
test_files:
|
110
|
+
- spec/spec_helper.rb
|
111
|
+
- spec/unit/enamdict_spec.rb
|
112
|
+
- spec/unit/parser_spec.rb
|
113
|
+
has_rdoc:
|