unihan_lang 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +30 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +102 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +68 -0
- data/README.md +57 -0
- data/Rakefile +12 -0
- data/data/Unihan_Variants.txt +17429 -0
- data/data/traditional_chinese_list.txt +6017 -0
- data/lib/unihan_lang/chinese_processor.rb +88 -0
- data/lib/unihan_lang/version.rb +5 -0
- data/lib/unihan_lang.rb +68 -0
- data/test.rb +58 -0
- data/traditional_characters.txt +0 -0
- data/unihan_lang.gemspec +30 -0
- metadata +102 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"

module UnihanLang
  # Classifies single characters as Traditional Chinese, Simplified Chinese or
  # shared between both, using the variant tables shipped in the gem's data
  # directory (Unihan_Variants.txt and traditional_chinese_list.txt).
  class ChineseProcessor
    # zh_tw  - Set of characters that ended up on the traditional side only.
    # zh_cn  - Set of characters seen on the simplified side; it still contains
    #          the shared characters (see #process_character_sets).
    # common - Set of characters that appeared on both sides.
    attr_reader :zh_tw, :zh_cn, :common

    # Builds the three character sets by reading the data files from disk.
    def initialize
      @zh_tw = Set.new
      @zh_cn = Set.new
      @common = Set.new
      load_chinese_characters
    end

    # @param char [String] a single character
    # @return [Boolean] true when the character is usable in Traditional Chinese
    #   (traditional-only or shared)
    def zh_tw?(char)
      @zh_tw.include?(char) || @common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true when the character is usable in Simplified Chinese
    #   (simplified-side or shared)
    def zh_cn?(char)
      @zh_cn.include?(char) || @common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true when the character is traditional and not shared
    def only_zh_tw?(char)
      @zh_tw.include?(char) && !@common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true when the character is simplified and not shared
    def only_zh_cn?(char)
      # @zh_cn keeps the shared characters, so they must be excluded here;
      # otherwise every shared character would count as "simplified only".
      @zh_cn.include?(char) && !@common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true when the character is in one of the loaded sets or
    #   inside the U+4E00..U+9FFF code point range
    def chinese?(char)
      zh_tw?(char) || zh_cn?(char) || cjk?(char)
    end

    # Alias-style wrapper around #chinese?.
    def chinese_character?(char)
      chinese?(char)
    end

    private

    # True when the first code point of +char+ lies in U+4E00..U+9FFF.
    def cjk?(char)
      char.ord.between?(0x4E00, 0x9FFF)
    end

    # Loads both data files, then derives the shared set.
    def load_chinese_characters
      load_unihan_variants
      load_traditional_chinese_list
      process_character_sets
    end

    # Absolute-ish path of a file inside the gem's top-level data directory.
    def data_path(file_name)
      File.join(File.dirname(__FILE__), "..", "..", "data", file_name)
    end

    # Reads Unihan_Variants.txt: tab-separated records, "#" lines are comments.
    def load_unihan_variants
      File.foreach(data_path("Unihan_Variants.txt"), encoding: "UTF-8") do |line|
        next if line.start_with?("#") || line.strip.empty?

        fields = line.strip.split("\t")
        process_unihan_fields(fields) if fields.size >= 3
      end
    end

    # Records one Unihan variant entry.
    #
    # @param fields [Array<String>] [code point, property name, variant list]
    #
    # The variant list may hold several whitespace-separated code points, each
    # optionally followed by a "<source" suffix, for example:
    #   U+348B kSemanticVariant U+5EDD<kMatthews U+53AE<kMatthews
    # Every listed variant is recorded, not just the first one.
    def process_unihan_fields(fields)
      char = codepoint_to_char(fields[0])
      variants = fields[2].split.map { |token| codepoint_to_char(token) }.compact
      return if char.nil? || variants.empty?

      case fields[1]
      when "kTraditionalVariant"
        @zh_tw.merge(variants)
        @zh_cn << char
      when "kSimplifiedVariant"
        @zh_cn.merge(variants)
        @zh_tw << char
      end
    end

    # Converts a "U+XXXX" token (any trailing "<source" part is ignored) into
    # the UTF-8 character it names; returns nil for tokens without that prefix.
    def codepoint_to_char(token)
      digits = token[/\AU\+(\h+)/, 1]
      digits && [digits.hex].pack("U")
    end

    # Adds every non-blank line of traditional_chinese_list.txt to @zh_tw.
    def load_traditional_chinese_list
      File.foreach(data_path("traditional_chinese_list.txt"), encoding: "UTF-8") do |line|
        entry = line.strip
        @zh_tw << entry unless entry.empty?
      end
    end

    # Splits the raw sets: characters present on both sides become @common and
    # are removed from @zh_tw. @zh_cn is left as loaded, so it still includes
    # the shared characters (same contents the reader exposed before).
    def process_character_sets
      @common = @zh_tw & @zh_cn
      @zh_tw -= @common
    end
  end
end
|
data/lib/unihan_lang.rb
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "unihan_lang/version"
|
|
4
|
+
require_relative "unihan_lang/chinese_processor"
|
|
5
|
+
|
|
6
|
+
module UnihanLang
  # Text-level helpers that classify whole strings by delegating each
  # character to a ChineseProcessor.
  class Unihan
    def initialize
      @chinese_processor = ChineseProcessor.new
    end

    # @param text [String]
    # @return [Boolean] true when the text is classified as Traditional Chinese
    def zh_tw?(text)
      language_ratio(text) == :tw
    end

    # @param text [String]
    # @return [Boolean] true when the text is classified as Simplified Chinese
    def zh_cn?(text)
      language_ratio(text) == :cn
    end

    # @return [Boolean] true when every character is traditional-only
    def only_zh_tw?(text)
      text.chars.all? { |char| @chinese_processor.only_zh_tw?(char) }
    end

    # @return [Boolean] true when every character is simplified-only
    def only_zh_cn?(text)
      text.chars.all? { |char| @chinese_processor.only_zh_cn?(char) }
    end

    # @return [Boolean] true when at least one character is traditional-only
    def contains_zh_tw?(text)
      text.chars.any? { |char| @chinese_processor.only_zh_tw?(char) }
    end

    # @return [Boolean] true when at least one character is simplified-only
    def contains_zh_cn?(text)
      text.chars.any? { |char| @chinese_processor.only_zh_cn?(char) }
    end

    # @return [Boolean] true when at least one character is Chinese
    def contains_chinese?(text)
      text.chars.any? { |char| @chinese_processor.chinese_character?(char) }
    end

    # @return [Array<String>] the Chinese characters of +text+, in order
    def extract_chinese_characters(text)
      text.chars.select { |char| @chinese_processor.chinese_character?(char) }
    end

    # @param text [String]
    # @return [String] "ZH_TW", "ZH_CN" or "Unknown"
    def determine_language(text)
      case language_ratio(text)
      # language_ratio does not produce :ja at present; the branch is kept so
      # the mapping stays valid if a Japanese classifier is added.
      when :ja then "JA"
      when :tw then "ZH_TW"
      when :cn then "ZH_CN"
      else "Unknown"
      end
    end

    private

    # Counts the traditional-only and simplified-only characters of the text
    # and returns the more likely language.
    #
    # @return [Symbol] :unknown when the text is empty or contains any
    #   non-Chinese character; :tw when traditional-only characters
    #   outnumber simplified-only ones; :cn otherwise (ties included).
    def language_ratio(text)
      chars = text.chars
      # An empty string has no evidence either way; without this guard it
      # would pass the "all Chinese" check and be reported as :cn.
      return :unknown if chars.empty?
      return :unknown unless chars.all? { |char| @chinese_processor.chinese?(char) }

      only_tw_chars = chars.count { |char| @chinese_processor.only_zh_tw?(char) }
      only_cn_chars = chars.count { |char| @chinese_processor.only_zh_cn?(char) }

      only_tw_chars > only_cn_chars ? :tw : :cn
    end
  end
end
|
data/test.rb
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# rubocop:disable all
|
|
3
|
+
# Manual smoke script: prints the detected language of a few sample words and
# the per-character set membership reported by the Chinese processor.
$LOAD_PATH.unshift File.expand_path("lib", __dir__)
require "unihan_lang"

unihan = UnihanLang::Unihan.new

# Looked up once; both are the same objects for every word and character.
chinese_processor = unihan.instance_variable_get(:@chinese_processor)
# Unihan does not set @japanese_processor, so this is nil; every use below is
# guarded instead of raising NoMethodError.
japanese_processor = unihan.instance_variable_get(:@japanese_processor)

test_cases = %w[
  繁體字
  简体字
  日本語
  中文
  漢字
  汉字
  東京
  北京
  台北
  ひらがな
  カタカナ
  漢字とひらがな
  こんにちは世界
  你好世界
  你好世界
  實際的例子
  实际的例子
  現実の例
]

test_cases.each do |word|
  puts "\nTesting '#{word}':"
  puts "zh_tw?: #{unihan.zh_tw?(word)}"
  puts "zh_cn?: #{unihan.zh_cn?(word)}"
  # Unihan has no ja? method; only report it when one exists.
  puts "ja?: #{unihan.ja?(word)}" if unihan.respond_to?(:ja?)
  puts "Language: #{unihan.determine_language(word)}"
  puts "Character details:"
  word.each_char do |char|
    print "#{char}: "
    flags = {
      "ZH_TW " => chinese_processor.zh_tw.include?(char),
      "ZH_CN " => chinese_processor.zh_cn.include?(char),
      "Common " => chinese_processor.common.include?(char),
      "Chinese " => chinese_processor.chinese?(char),
      "Japanese " => japanese_processor&.japanese?(char.to_s),
      "KANA " => char.match?(/[\p{Hiragana}\p{Katakana}ー]/)
    }
    matched = flags.select { |_label, hit| hit }.keys
    matched.each { |label| print label }
    print "UNKNOWN" if matched.empty?
    puts
  end
end
|
|
58
|
+
# rubocop:enable all
|
|
File without changes
|
data/unihan_lang.gemspec
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# rubocop:disable Layout/LineLength, Gemspec/RequiredRubyVersion
|
|
4
|
+
require_relative "lib/unihan_lang/version"
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "unihan_lang"
  spec.version       = UnihanLang::VERSION
  spec.authors       = ["kyubey1228"]
  spec.email         = ["kyuuka1228@gmail.com"]

  spec.summary       = "Language detection for Chinese and Japanese characters"
  spec.description   = "A gem to detect and differentiate between Traditional Chinese, Simplified Chinese, and Japanese characters based on Unihan data."
  spec.homepage      = "https://github.com/kyubey1228/unihan_lang"
  spec.license       = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")

  # The file list must be assigned before anything is derived from it
  # (spec.executables below greps spec.files). The data glob is resolved
  # inside the chdir so it does not depend on the caller's working directory.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
    (tracked + Dir["data/*"]).uniq
  end

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
|
|
30
|
+
# rubocop:enable all
|
metadata
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: unihan_lang
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- kyubey1228
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2024-09-08 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '2.0'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '2.0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '13.0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '13.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: rspec
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '3.0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '3.0'
|
|
55
|
+
description: A gem to detect and differentiate between Traditional Chinese, Simplified
|
|
56
|
+
Chinese, and Japanese characters based on Unihan data.
|
|
57
|
+
email:
|
|
58
|
+
- kyuuka1228@gmail.com
|
|
59
|
+
executables: []
|
|
60
|
+
extensions: []
|
|
61
|
+
extra_rdoc_files: []
|
|
62
|
+
files:
|
|
63
|
+
- ".github/workflows/main.yml"
|
|
64
|
+
- ".gitignore"
|
|
65
|
+
- ".rspec"
|
|
66
|
+
- ".rubocop.yml"
|
|
67
|
+
- Gemfile
|
|
68
|
+
- Gemfile.lock
|
|
69
|
+
- README.md
|
|
70
|
+
- Rakefile
|
|
71
|
+
- data/Unihan_Variants.txt
|
|
72
|
+
- data/traditional_chinese_list.txt
|
|
73
|
+
- lib/unihan_lang.rb
|
|
74
|
+
- lib/unihan_lang/chinese_processor.rb
|
|
75
|
+
- lib/unihan_lang/version.rb
|
|
76
|
+
- test.rb
|
|
77
|
+
- traditional_characters.txt
|
|
78
|
+
- unihan_lang.gemspec
|
|
79
|
+
homepage: https://github.com/kyubey1228/unihan_lang
|
|
80
|
+
licenses:
|
|
81
|
+
- MIT
|
|
82
|
+
metadata: {}
|
|
83
|
+
post_install_message:
|
|
84
|
+
rdoc_options: []
|
|
85
|
+
require_paths:
|
|
86
|
+
- lib
|
|
87
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
88
|
+
requirements:
|
|
89
|
+
- - ">="
|
|
90
|
+
- !ruby/object:Gem::Version
|
|
91
|
+
version: 2.5.0
|
|
92
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - ">="
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '0'
|
|
97
|
+
requirements: []
|
|
98
|
+
rubygems_version: 3.5.11
|
|
99
|
+
signing_key:
|
|
100
|
+
specification_version: 4
|
|
101
|
+
summary: Language detection for Chinese and Japanese characters
|
|
102
|
+
test_files: []
|