ke2daira 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +39 -23
- data/ke2daira.gemspec +40 -0
- data/lib/ke2daira/kana2mora.rb +23 -0
- data/lib/ke2daira/ke2dairanizer.rb +71 -0
- data/lib/ke2daira/version.rb +1 -1
- data/lib/ke2daira.rb +6 -60
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4aaa1e18e53c6f8aa6c87b8457da21af7212910cfa195993c19675e4ba7e70f
|
4
|
+
data.tar.gz: 4f547aa4bd60e8eea105883e08b29d7c19cdda509dff8cb0d7a472f3db715b1a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0166f9cdce6444a1aa3e1bb80061d2d125eae8163a184c210cf0e77ca725b11a761bc76082f95c6b03060ef40f5cc26f883566ad023231a980914844d5ec2ee1
|
7
|
+
data.tar.gz: ddc9c67a576e5cfab2d7a330cfce56fbc13b79f3006ab3ac8906f9ee441087d5a764bddbdd88d368e342da591fb714cfda94372318fb875237940e4efa09507b
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,24 +1,27 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ke2daira (0.1
|
4
|
+
ke2daira (0.2.1)
|
5
5
|
suika (~> 0.3.2)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
-
ast (2.4.
|
10
|
+
ast (2.4.3)
|
11
11
|
dartsclone (0.3.2)
|
12
12
|
diff-lcs (1.5.0)
|
13
|
-
json (2.
|
14
|
-
language_server-protocol (3.17.0.
|
15
|
-
|
16
|
-
|
13
|
+
json (2.12.2)
|
14
|
+
language_server-protocol (3.17.0.5)
|
15
|
+
lint_roller (1.1.0)
|
16
|
+
parallel (1.27.0)
|
17
|
+
parser (3.3.8.0)
|
17
18
|
ast (~> 2.4.1)
|
19
|
+
racc
|
20
|
+
prism (1.4.0)
|
21
|
+
racc (1.8.1)
|
18
22
|
rainbow (3.1.1)
|
19
23
|
rake (13.0.6)
|
20
|
-
regexp_parser (2.
|
21
|
-
rexml (3.2.5)
|
24
|
+
regexp_parser (2.10.0)
|
22
25
|
rspec (3.12.0)
|
23
26
|
rspec-core (~> 3.12.0)
|
24
27
|
rspec-expectations (~> 3.12.0)
|
@@ -32,29 +35,42 @@ GEM
|
|
32
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
33
36
|
rspec-support (~> 3.12.0)
|
34
37
|
rspec-support (3.12.0)
|
35
|
-
rubocop (1.
|
38
|
+
rubocop (1.75.8)
|
36
39
|
json (~> 2.3)
|
40
|
+
language_server-protocol (~> 3.17.0.2)
|
41
|
+
lint_roller (~> 1.1.0)
|
37
42
|
parallel (~> 1.10)
|
38
|
-
parser (>= 3.
|
43
|
+
parser (>= 3.3.0.2)
|
39
44
|
rainbow (>= 2.2.2, < 4.0)
|
40
|
-
regexp_parser (>=
|
41
|
-
|
42
|
-
rubocop-ast (>= 1.26.0, < 2.0)
|
45
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
46
|
+
rubocop-ast (>= 1.44.0, < 2.0)
|
43
47
|
ruby-progressbar (~> 1.7)
|
44
|
-
unicode-display_width (>= 2.4.0, <
|
45
|
-
rubocop-ast (1.
|
46
|
-
parser (>= 3.
|
47
|
-
|
48
|
-
|
49
|
-
|
48
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
49
|
+
rubocop-ast (1.45.1)
|
50
|
+
parser (>= 3.3.7.2)
|
51
|
+
prism (~> 1.4)
|
52
|
+
rubocop-performance (1.25.0)
|
53
|
+
lint_roller (~> 1.1)
|
54
|
+
rubocop (>= 1.75.0, < 2.0)
|
55
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
50
56
|
ruby-progressbar (1.13.0)
|
51
|
-
standard (1.
|
57
|
+
standard (1.50.0)
|
52
58
|
language_server-protocol (~> 3.17.0.2)
|
53
|
-
|
54
|
-
rubocop
|
59
|
+
lint_roller (~> 1.0)
|
60
|
+
rubocop (~> 1.75.5)
|
61
|
+
standard-custom (~> 1.0.0)
|
62
|
+
standard-performance (~> 1.8)
|
63
|
+
standard-custom (1.0.2)
|
64
|
+
lint_roller (~> 1.0)
|
65
|
+
rubocop (~> 1.50)
|
66
|
+
standard-performance (1.8.0)
|
67
|
+
lint_roller (~> 1.1)
|
68
|
+
rubocop-performance (~> 1.25.0)
|
55
69
|
suika (0.3.2)
|
56
70
|
dartsclone (>= 0.2.0)
|
57
|
-
unicode-display_width (
|
71
|
+
unicode-display_width (3.1.4)
|
72
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
73
|
+
unicode-emoji (4.0.4)
|
58
74
|
|
59
75
|
PLATFORMS
|
60
76
|
x86_64-linux
|
data/ke2daira.gemspec
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/ke2daira/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
|
6
|
+
spec.name = "ke2daira"
|
7
|
+
spec.version = Ke2daira::VERSION
|
8
|
+
spec.authors = ["otariidae"]
|
9
|
+
spec.email = ["otariidae@users.noreply.github.com"]
|
10
|
+
|
11
|
+
spec.summary = "ke2daira"
|
12
|
+
spec.description = "A Ruby implementation of ke2daira"
|
13
|
+
spec.homepage = "https://github.com/otariidae/ke2daira.rb"
|
14
|
+
spec.license = "MIT"
|
15
|
+
spec.required_ruby_version = ">= 3.2.0"
|
16
|
+
|
17
|
+
spec.metadata["allowed_push_host"] = "https://rubygems.org"
|
18
|
+
|
19
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
20
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
21
|
+
spec.metadata["changelog_uri"] = "https://github.com/otariidae/ke2daira.rb/blob/main/CHANGELOG.md"
|
22
|
+
|
23
|
+
# Specify which files should be added to the gem when it is released.
|
24
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
25
|
+
spec.files = Dir.chdir(__dir__) do
|
26
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
27
|
+
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
28
|
+
end
|
29
|
+
end
|
30
|
+
spec.bindir = "exe"
|
31
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
32
|
+
spec.require_paths = ["lib"]
|
33
|
+
|
34
|
+
# Runtime dependencies of the gem
|
35
|
+
spec.add_dependency "suika", "~> 0.3.2"
|
36
|
+
|
37
|
+
# For more information and examples about making a new gem, check out our
|
38
|
+
# guide at: https://bundler.io/guides/creating_gem.html
|
39
|
+
spec.metadata["rubygems_mfa_required"] = "true"
|
40
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ke2daira
|
4
|
+
module Kana2Mora
|
5
|
+
SUTEKANA = Set["ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ"]
|
6
|
+
|
7
|
+
# converts Katakana into a list of mora.
|
8
|
+
def self.katakana2mora(katakana)
|
9
|
+
chars = katakana.chars
|
10
|
+
moras = []
|
11
|
+
chars.each do |char|
|
12
|
+
if SUTEKANA.include?(char)
|
13
|
+
previous_char = moras.pop || ""
|
14
|
+
mora = previous_char + char
|
15
|
+
moras << mora
|
16
|
+
next
|
17
|
+
end
|
18
|
+
moras << char
|
19
|
+
end
|
20
|
+
moras
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "suika"
|
4
|
+
|
5
|
+
module Ke2daira
|
6
|
+
class Ke2dairanizer
|
7
|
+
def initialize(tagger: ::Suika::Tagger.new, separator: " ")
|
8
|
+
@tagger = tagger
|
9
|
+
@separator = separator
|
10
|
+
end
|
11
|
+
|
12
|
+
attr_reader :tagger, :separator
|
13
|
+
|
14
|
+
# Ke2dairanize the given name
|
15
|
+
# @param fullname [String]
|
16
|
+
# @return [String] ke2dairanized name
|
17
|
+
def ke2dairanize(fullname) # rubocop:disable Metrics/AbcSize
|
18
|
+
names = fullname.strip.split(separator)
|
19
|
+
yomis = names.map { |name| to_yomi(name) }
|
20
|
+
|
21
|
+
return yomis[0] if yomis.length == 1
|
22
|
+
|
23
|
+
first_word_moras = Kana2Mora.katakana2mora(yomis[0])
|
24
|
+
first_word_head = first_word_moras[0]
|
25
|
+
first_word_tail = first_word_moras[1..]
|
26
|
+
|
27
|
+
last_word_moras = Kana2Mora.katakana2mora(yomis[-1])
|
28
|
+
last_word_head = last_word_moras[0]
|
29
|
+
last_word_tail = last_word_moras[1..]
|
30
|
+
|
31
|
+
new_first_word = last_word_head + first_word_tail.join
|
32
|
+
new_last_word = first_word_head + last_word_tail.join
|
33
|
+
|
34
|
+
yomis[0] = new_first_word
|
35
|
+
yomis[-1] = new_last_word
|
36
|
+
yomis.join(separator)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
# Tokenize the given sentence
|
42
|
+
# @param sentence [String] Japanese text to be tokenized
|
43
|
+
# @return [Array<SuikaToken>]
|
44
|
+
def tokenize(sentence)
|
45
|
+
raw_tokens = tagger.parse(sentence)
|
46
|
+
raw_tokens.map { |raw_token| raw_token2suikatoken(raw_token) }
|
47
|
+
end
|
48
|
+
|
49
|
+
# Convert Suika raw token to SuikaToken
|
50
|
+
# @param raw_token [String] Suika raw token
|
51
|
+
# @return [SuikaToken]
|
52
|
+
def raw_token2suikatoken(raw_token)
|
53
|
+
surface_form, rest_raw_token = raw_token.split("\t")
|
54
|
+
pos, pos_detail1, pos_detail2, pos_detail3, conjugated_type, conjugated_form,
|
55
|
+
basic_form, reading, pronunciation = rest_raw_token.split(",")
|
56
|
+
SuikaToken.new(surface_form, pos, pos_detail1, pos_detail2, pos_detail3, conjugated_type,
|
57
|
+
conjugated_form, basic_form, reading, pronunciation)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Convert the given word to its reading
|
61
|
+
# @param word [String] a Japanese word
|
62
|
+
# @return [String] reading of the word
|
63
|
+
def to_yomi(word)
|
64
|
+
tokens = tokenize(word)
|
65
|
+
# fallback to the surface form when the reading is missing
|
66
|
+
tokens.map do |token|
|
67
|
+
token.reading || token.surface_form
|
68
|
+
end.join
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
data/lib/ke2daira/version.rb
CHANGED
data/lib/ke2daira.rb
CHANGED
@@ -1,69 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative "ke2daira/kana2mora"
|
4
|
+
require_relative "ke2daira/ke2dairanizer"
|
3
5
|
require_relative "ke2daira/version"
|
4
|
-
require "suika"
|
5
6
|
|
6
7
|
# Ke2daira is a library to ke2dairanize
|
7
8
|
module Ke2daira
|
8
|
-
|
9
|
-
private_constant :SEPARATOR
|
9
|
+
SuikaToken = Data.define(:surface_form, :pos, :pos_detail1, :pos_detail2, :pos_detail3, :conjugated_type, :conjugated_form, :basic_form, :reading, :pronunciation)
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
SuikaToken = Data.define(:surface_form, :pos, :pos_detail1, :pos_detail2, :pos_detail3, :conjugated_type,
|
15
|
-
:conjugated_form, :basic_form, :reading, :pronunciation)
|
16
|
-
private_constant :SuikaToken
|
17
|
-
|
18
|
-
# Ke2dairanize the givin name
|
19
|
-
# @param fullname [String]
|
20
|
-
# @return [String] ke2dairanized name
|
21
|
-
def self.ke2dairanize(fullname) # rubocop:disable Metrics/AbcSize
|
22
|
-
names = fullname.strip.split(SEPARATOR)
|
23
|
-
yomis = names.map { |name| to_yomi(name) }
|
24
|
-
|
25
|
-
return yomis[0] if yomis.length == 1
|
26
|
-
|
27
|
-
first_word = yomis[0]
|
28
|
-
last_word = yomis[-1]
|
29
|
-
|
30
|
-
# swap first characters of first and last name
|
31
|
-
yomis[0] = last_word[0] + first_word[1..]
|
32
|
-
yomis[-1] = first_word[0] + last_word[1..]
|
33
|
-
yomis.join(SEPARATOR)
|
34
|
-
end
|
35
|
-
|
36
|
-
class << self
|
37
|
-
private
|
38
|
-
|
39
|
-
# Tokenize the givin sentence
|
40
|
-
# @param sentence [String] Japanese text to be tokenized
|
41
|
-
# @return [Array<SuikaToken>]
|
42
|
-
def tokenize(sentence)
|
43
|
-
raw_tokens = TAGGER.parse(sentence)
|
44
|
-
raw_tokens.map { |raw_token| raw_token2suikatoken(raw_token) }
|
45
|
-
end
|
46
|
-
|
47
|
-
# Convert Suika raw token to SuikaToken
|
48
|
-
# @param raw_token [String] Suika raw token
|
49
|
-
# @return [SuikaToken]
|
50
|
-
def raw_token2suikatoken(raw_token)
|
51
|
-
surface_form, rest_raw_token = raw_token.split("\t")
|
52
|
-
pos, pos_detail1, pos_detail2, pos_detail3, conjugated_type, conjugated_form,
|
53
|
-
basic_form, reading, pronunciation = rest_raw_token.split(",")
|
54
|
-
SuikaToken.new(surface_form, pos, pos_detail1, pos_detail2, pos_detail3, conjugated_type,
|
55
|
-
conjugated_form, basic_form, reading, pronunciation)
|
56
|
-
end
|
57
|
-
|
58
|
-
# Convert the givin word to its reading
|
59
|
-
# @param word [String] a Japanese word
|
60
|
-
# @return [String] reading of the word
|
61
|
-
def to_yomi(word)
|
62
|
-
tokens = tokenize(word)
|
63
|
-
# fallback to the surface form when the reading is missing
|
64
|
-
tokens.map do |token|
|
65
|
-
token.reading || token.surface_form
|
66
|
-
end.join
|
67
|
-
end
|
11
|
+
def self.ke2dairanize(fullname)
|
12
|
+
@ke2dairanizer ||= Ke2dairanizer.new
|
13
|
+
@ke2dairanizer.ke2dairanize(fullname)
|
68
14
|
end
|
69
15
|
end
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ke2daira
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- otariidae
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: suika
|
@@ -38,7 +37,10 @@ files:
|
|
38
37
|
- LICENSE.txt
|
39
38
|
- README.md
|
40
39
|
- Rakefile
|
40
|
+
- ke2daira.gemspec
|
41
41
|
- lib/ke2daira.rb
|
42
|
+
- lib/ke2daira/kana2mora.rb
|
43
|
+
- lib/ke2daira/ke2dairanizer.rb
|
42
44
|
- lib/ke2daira/version.rb
|
43
45
|
- sig/ke2daira.rbs
|
44
46
|
homepage: https://github.com/otariidae/ke2daira.rb
|
@@ -50,7 +52,6 @@ metadata:
|
|
50
52
|
source_code_uri: https://github.com/otariidae/ke2daira.rb
|
51
53
|
changelog_uri: https://github.com/otariidae/ke2daira.rb/blob/main/CHANGELOG.md
|
52
54
|
rubygems_mfa_required: 'true'
|
53
|
-
post_install_message:
|
54
55
|
rdoc_options: []
|
55
56
|
require_paths:
|
56
57
|
- lib
|
@@ -65,8 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
65
66
|
- !ruby/object:Gem::Version
|
66
67
|
version: '0'
|
67
68
|
requirements: []
|
68
|
-
rubygems_version: 3.
|
69
|
-
signing_key:
|
69
|
+
rubygems_version: 3.6.7
|
70
70
|
specification_version: 4
|
71
71
|
summary: ke2daira
|
72
72
|
test_files: []
|