japanese_names 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,104 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
module JapaneseNames

  # Query interface for the ENAMDICT file (http://www.csse.monash.edu.au/~jwb/enamdict_doc.html)
  #
  # Entries are read from bin/enamdict.min, one per line in the form
  # "kanji|kana|flags", and are kept in memory after the first query.
  module Enamdict

    # ENAMDICT name-type flags:
    #
    #   s - surname (138,500)
    #   p - place-name (99,500)
    #   u - person name, either given or surname, as-yet unclassified (139,000)
    #   g - given name, as-yet not classified by sex (64,600)
    #   f - female given name (106,300)
    #   m - male given name (14,500)
    NAME_FAM = %w(s p u).freeze
    NAME_GIV = %w(u g f m).freeze
    NAME_ANY = (NAME_FAM | NAME_GIV).freeze

    class << self

      # Public: Matches kanji and/or kana regex strings in the dictionary.
      #
      # opts - The Hash options used to match the dictionary (default: {}).
      #        The Hash is only read, never modified:
      #        kanji: Regex String, Regexp, or Array of alternatives to match
      #               the kanji name (optional)
      #        kana:  Regex String, Regexp, or Array of alternatives to match
      #               the kana name (optional)
      #        flags: Flag String or Array of flags to filter the match (optional)
      #
      # Returns the dict entries as an Array of Arrays [[kanji, kana, flags], ...]
      # Returns an empty Array when neither :kanji nor :kana is given.
      def match(opts={})
        kanji = opts[:kanji]
        kana  = opts[:kana]
        return [] unless kanji || kana

        # Every dictionary line must match in full: kanji|kana|flags
        regex = /\A#{name_regex(kanji)}\|#{name_regex(kana)}\|#{flags_regex(opts[:flags])}\z/

        search { |line| line =~ regex }
      end

      # Public: Selects entries in the enamdict based on a block which should
      # evaluate true or false (typically a regex).
      #
      # Yields each raw dictionary line (String "kanji|kana|flags").
      #
      # Returns the dict entries as an Array of Arrays [[kanji, kana, flags], ...]
      def search(&block)
        selected = []
        each_line { |line| selected << unpack_line(line) if block.call(line) }
        selected
      end

      protected

      # Internal: Returns the filepath to the enamdict.min file.
      def filepath
        File.join(File.dirname(__FILE__), '../../bin/enamdict.min')
      end

      # Internal: The memoized dictionary instance.
      #
      # Returns a frozen Array of lines with their line terminators removed.
      def dict
        # chomp (rather than slicing off the last character) keeps the final
        # entry intact when the file does not end with a newline.
        @dict ||= File.readlines(filepath, encoding: 'utf-8').map(&:chomp).freeze
      end

      # Internal: Calls the given block for each line in the dict.
      def each_line(&block)
        dict.each { |line| block.call(line) }
      end

      # Internal: Formats a line as a 3-tuple Array [kanji, kana, flags]
      def unpack_line(line)
        line.split('|')
      end

      # Internal: Builds regex criteria for name.
      #
      # name - A regex String, a Regexp, an Array of alternatives, or nil
      #        (nil matches any name).
      #
      # Returns a regex source String that is safe to embed in a larger pattern.
      def name_regex(name)
        case name
        # Grouping keeps a top-level "|" inside the criterion from splitting
        # the surrounding kanji|kana|flags pattern.
        when String then "(?:#{name})"
        when Regexp then name.to_s
        when Array  then "(?:#{name.join('|')})"
        else '.+?'
        end
      end

      # Internal: Builds regex criteria for flags.
      #
      # flags - A flag String (matched against the whole flags field), an
      #         Array of single-character flags (matches when the field
      #         contains any of them), or nil (matches any flags).
      #
      # Returns a regex source String.
      def flags_regex(flags)
        case flags
        when nil, false, NAME_ANY then '.+?'
        when Array then ".*?[#{flags.join}].*?"
        else "(?:#{flags})"
        end
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
module JapaneseNames

  # Provides methods for parsing Japanese name strings.
  class Parser

    # Given a kanji and kana representation of a name, splits it into family/given names.
    #
    # The family-name split is tried first and the given-name split is the
    # fallback. The choice to prioritize family name is arbitrary. Further
    # analysis is needed for whether given or family name should be prioritized.
    #
    # kanji - The full name String in kanji.
    # kana  - The full name String in hiragana and/or katakana.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match.
    def split(kanji, kana)
      split_fam(kanji, kana) || split_giv(kanji, kana)
    end

    # Splits the name by finding the longest dictionary entry which matches
    # the right-hand (given name) side of both the kanji and the kana.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match.
    def split_giv(kanji, kana)
      entry, kana_giv = longest_match(window_right(kanji)) { |reading| kana[/#{hk(reading)}\z/] }
      return unless entry

      kanji_giv = entry[0]
      [[mask_right(kanji, kanji_giv), kanji_giv], [mask_right(kana, kana_giv), kana_giv]]
    end

    # Splits the name by finding the longest dictionary entry which matches
    # the left-hand (family name) side of both the kanji and the kana.
    #
    # Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match.
    # Returns nil if there was no match.
    def split_fam(kanji, kana)
      entry, kana_fam = longest_match(window_left(kanji)) { |reading| kana[/\A#{hk(reading)}/] }
      return unless entry

      kanji_fam = entry[0]
      [[kanji_fam, mask_left(kanji, kanji_fam)], [kana_fam, mask_left(kana, kana_fam)]]
    end

    # TODO: add option to strip honorific '様'
    # TODO: add option to infer sex (0 = unknown, 1 = male, 2 = female as per ISO/IEC 5218)

    protected

    # Looks up the candidate kanji substrings in the dictionary and picks the
    # entry with the longest kanji whose kana reading is accepted by the block.
    #
    # windows - Array of candidate kanji substrings.
    #
    # Yields the kana reading of each candidate entry; the block returns the
    # matched portion of the input kana, or nil when the reading does not fit.
    #
    # Returns Array [entry, matched_kana], or nil if nothing matched.
    def longest_match(windows)
      # A kanji name of fewer than two characters cannot be split.
      return if windows.empty?

      # Enamdict.match treats its criteria as regex source, so literal name
      # text has to be escaped before it is handed over.
      entries = Enamdict.match(kanji: windows.map { |window| Regexp.escape(window) })
      entries = entries.sort { |x, y| y[0].size <=> x[0].size }

      entries.each do |entry|
        matched_kana = yield(entry[1])
        return [entry, matched_kana] if matched_kana
      end
      nil
    end

    # Returns a regex string which matches both hiragana and katakana variations of a String.
    def hk(str)
      "(?:#{Regexp.escape(Moji.kata_to_hira(str))}|#{Regexp.escape(Moji.hira_to_kata(str))})"
    end

    # Masks a String from the left side and returns the remaining (right) portion of the String.
    # The mask is treated as literal text, not as a regex.
    #
    # Example: mask_left("abcde", "ab") #=> "cde"
    def mask_left(str, mask)
      str.sub(/\A#{Regexp.escape(mask)}/, '')
    end

    # Masks a String from the right side and returns the remaining (left) portion of the String.
    # The mask is treated as literal text, not as a regex.
    #
    # Example: mask_right("abcde", "de") #=> "abc"
    def mask_right(str, mask)
      str.sub(/#{Regexp.escape(mask)}\z/, '')
    end

    # Given a String, returns an array of progressively smaller substrings anchored on the left side.
    #
    # Example: window_left("abcde") #=> ["abcd", "abc", "ab", "a"]
    def window_left(str)
      (str.size - 1).downto(1).map { |length| str[0, length] }
    end

    # Given a String, returns an array of progressively smaller substrings anchored on the right side.
    #
    # Example: window_right("abcde") #=> ["bcde", "cde", "de", "e"]
    def window_right(str)
      (1...str.size).map { |start| str[start..-1] }
    end
  end
end
@@ -0,0 +1,6 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
module JapaneseNames
  # Public: The gem's release version String. Frozen so it cannot be mutated
  # in place by code that reads it.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,7 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'moji'
4
+
5
+ require 'japanese_names/version'
6
+ require 'japanese_names/enamdict'
7
+ require 'japanese_names/parser'
@@ -0,0 +1,11 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ $:.push File.expand_path('../../lib', __FILE__)
5
+
6
+ require 'rubygems'
7
+ require 'japanese_names'
8
+
9
# Suite-wide RSpec settings: select :rspec as the mocking framework.
RSpec.configure { |config| config.mock_with :rspec }
@@ -0,0 +1,59 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'spec_helper'
5
+
6
# Exercises the public query methods of Enamdict against the dictionary
# file that ships with the gem.
describe JapaneseNames::Enamdict do

  subject { JapaneseNames::Enamdict }

  describe '#search' do

    it 'should select only lines which match criteria' do
      result = subject.search { |line| line =~ /^.+?\|あわのはら\|.+?$/ }
      expect(result).to eq [["粟野原", "あわのはら", "s"]]
    end

    it 'should select multiple lines' do
      result = subject.search { |line| line =~ /^.+?\|はしの\|.+?$/ }
      expect(result).to eq [["橋之", "はしの", "p"],
                            ["橋埜", "はしの", "s"],
                            ["橋野", "はしの", "s"],
                            ["端野", "はしの", "s"],
                            ["箸野", "はしの", "s"]]
    end
  end

  # The method under test here is Enamdict.match.
  describe '#match' do

    it 'should match kanji only' do
      result = subject.match(kanji: '外世子')
      expect(result).to eq [["外世子", "とよこ", "f"]]
    end

    it 'should match kana only' do
      result = subject.match(kana: 'ならしま')
      expect(result).to eq [["樽島", "ならしま", "u"],
                            ["奈良島", "ならしま", "s"],
                            ["楢島", "ならしま", "s"],
                            ["楢嶋", "ならしま", "s"]]
    end

    it 'should match both kanji and kana only' do
      result = subject.match(kanji: '楢二郎', kana: 'ならじろう')
      expect(result).to eq [["楢二郎", "ならじろう", "m"]]
    end

    it 'should match flags as String' do
      result = subject.match(kana: 'ならしま', flags: 's')
      expect(result).to eq [["奈良島", "ならしま", "s"],
                            ["楢島", "ならしま", "s"],
                            ["楢嶋", "ならしま", "s"]]
    end

    it 'should match flags as Array' do
      result = subject.match(kana: 'ならしま', flags: ['u','g'])
      expect(result).to eq [["樽島", "ならしま", "u"]]
    end
  end
end
@@ -0,0 +1,42 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'spec_helper'
5
+
6
# Exercises Parser#split and its two strategies (family-name first and
# given-name first) with names whose kana mixes hiragana and katakana.
describe JapaneseNames::Parser do

  subject { JapaneseNames::Parser.new }

  describe '#split' do

    # Each row is [kanji_fam, kanji_giv, kana_fam, kana_giv]; the parser is fed
    # the concatenated kanji and kana and must recover the four parts.
    [['上原','望','ウエハラ', 'ノゾミ'],
     ['樋口','知美','ヒグチ', 'ともみ'],
     ['堺','雅美','さかい', 'マサミ'],
     ['中村','幸子','ナカムラ', 'サチコ'],
     ['秋保','郁子','アキホ', 'いくこ'],
     ['光野','亜佐子','ミツノ', 'アサコ'],
     ['熊澤','貴子','クマザワ', 'タカコ']].each do |kanji_fam, kanji_giv, kana_fam, kana_giv|
      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv}" do
        result = subject.split(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end

      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv} by given name" do
        result = subject.split_giv(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end

      it "should parse #{kanji_fam+kanji_giv} #{kana_fam+kana_giv} by family name" do
        result = subject.split_fam(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to eq [[kanji_fam, kanji_giv], [kana_fam, kana_giv]]
      end
    end

    # Input which has no dictionary entry must yield nil rather than a guess.
    [['XXX','XXX','XXX', 'XXX']].each do |kanji_fam, kanji_giv, kana_fam, kana_giv|
      it "should return nil for invalid name #{kanji_fam+kanji_giv} #{kana_fam+kana_giv}" do
        result = subject.split(kanji_fam+kanji_giv, kana_fam+kana_giv)
        expect(result).to be_nil
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: japanese_names
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Johnny Shields
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: moji
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: 3.0.0
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 3.0.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: gem-release
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Japanese name parser based on ENAMDICT
70
+ email: johnny.shields@gmail.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - LICENSE
76
+ - README.md
77
+ - bin/enamdict.min
78
+ - lib/japanese_names.rb
79
+ - lib/japanese_names/enamdict.rb
80
+ - lib/japanese_names/parser.rb
81
+ - lib/japanese_names/version.rb
82
+ - spec/spec_helper.rb
83
+ - spec/unit/enamdict_spec.rb
84
+ - spec/unit/parser_spec.rb
85
+ homepage: https://github.com/johnnyshields/japanese_names
86
+ licenses:
87
+ - MIT
88
+ metadata: {}
89
+ post_install_message:
90
+ rdoc_options: []
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ requirements: []
104
+ rubyforge_project:
105
+ rubygems_version: 2.2.1
106
+ signing_key:
107
+ specification_version: 4
108
+ summary: Tools for parsing japanese names
109
+ test_files:
110
+ - spec/spec_helper.rb
111
+ - spec/unit/enamdict_spec.rb
112
+ - spec/unit/parser_spec.rb
113
+ has_rdoc: