unihan_lang 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
require "set" # Set is autoloaded only on Ruby >= 3.2; the gemspec permits >= 2.5.0.

module UnihanLang
  # Classifies single characters as Traditional Chinese, Simplified Chinese,
  # or shared between both, using the bundled Unihan variant table plus a
  # supplementary list of traditional characters.
  #
  # After construction the three readers expose:
  # * +zh_tw+  - characters seen only on the traditional side
  # * +zh_cn+  - every character seen on the simplified side (this set still
  #              contains the shared characters)
  # * +common+ - characters seen on both sides
  class ChineseProcessor
    # Directory holding the bundled data files (<gem root>/data).
    DATA_DIR = File.expand_path("../../data", __dir__)
    # Code point range of the Unicode "CJK Unified Ideographs" block.
    CJK_UNIFIED_IDEOGRAPHS = (0x4E00..0x9FFF).freeze
    private_constant :DATA_DIR, :CJK_UNIFIED_IDEOGRAPHS

    attr_reader :zh_tw, :zh_cn, :common

    # Builds the character sets by reading both data files from DATA_DIR.
    #
    # @raise [SystemCallError] if a data file cannot be read
    def initialize
      @zh_tw = Set.new
      @zh_cn = Set.new
      @common = Set.new
      load_chinese_characters
    end

    # @param char [String] a single character
    # @return [Boolean] true if +char+ is traditional-only or shared
    def zh_tw?(char)
      @zh_tw.include?(char) || @common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true if +char+ is on the simplified side or shared
    def zh_cn?(char)
      @zh_cn.include?(char) || @common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true if +char+ is traditional and not shared
    def only_zh_tw?(char)
      @zh_tw.include?(char) && !@common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true if +char+ is simplified and not shared
    def only_zh_cn?(char)
      # @zh_cn still contains the shared characters, so they must be excluded
      # here; otherwise this predicate would be identical to #zh_cn?.
      @zh_cn.include?(char) && !@common.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true if +char+ is in any of the loaded sets or lies in
    #   the CJK Unified Ideographs block
    def chinese?(char)
      zh_tw?(char) || zh_cn?(char) || cjk?(char)
    end

    alias chinese_character? chinese?

    private

    # True when the first code point of +char+ is in U+4E00..U+9FFF.
    # nil and "" are treated as "not CJK" instead of raising.
    def cjk?(char)
      return false if char.nil? || char.empty?

      CJK_UNIFIED_IDEOGRAPHS.cover?(char.ord)
    end

    # Loads both data sources, then separates shared characters.
    def load_chinese_characters
      load_unihan_variants
      load_traditional_chinese_list
      process_character_sets
    end

    # Reads Unihan_Variants.txt: tab-separated "code point, field, value"
    # records; comment lines (#) and blank lines are skipped.
    def load_unihan_variants
      path = File.join(DATA_DIR, "Unihan_Variants.txt")
      File.foreach(path, encoding: "UTF-8") do |line|
        entry = line.strip
        next if entry.empty? || entry.start_with?("#")

        fields = entry.split("\t")
        process_unihan_fields(fields) if fields.size >= 3
      end
    end

    # Records one variant entry.
    #
    # The value column may list several code points separated by spaces
    # (e.g. "U+4E7E U+5E72 U+5E79"); every one of them is recorded, not just
    # the first. Fields other than kTraditionalVariant / kSimplifiedVariant
    # are ignored.
    #
    # @param fields [Array<String>] [code point, field name, value]
    def process_unihan_fields(fields)
      char = codepoint_to_char(fields[0])
      variants = fields[2].split.map { |token| codepoint_to_char(token) }

      case fields[1]
      when "kTraditionalVariant"
        @zh_tw.merge(variants)
        @zh_cn << char
      when "kSimplifiedVariant"
        @zh_cn.merge(variants)
        @zh_tw << char
      end
    end

    # Converts a token such as "U+5EDD" or "U+5EDD<kMatthews" to its
    # character; the "<source" suffix (dictionary name) is dropped.
    def codepoint_to_char(token)
      [token.split("<").first.sub(/\AU\+/, "").hex].pack("U")
    end

    # Adds every non-blank, stripped line of traditional_chinese_list.txt to
    # the traditional set (presumably one character per line - the file format
    # is not visible here).
    def load_traditional_chinese_list
      path = File.join(DATA_DIR, "traditional_chinese_list.txt")
      File.foreach(path, encoding: "UTF-8") do |line|
        entry = line.strip
        @zh_tw << entry unless entry.empty?
      end
    end

    # Moves characters present on both sides into @common and removes them
    # from @zh_tw. @zh_cn is left untouched and therefore keeps the shared
    # characters - the same sets the previous implementation produced.
    def process_character_sets
      @common = @zh_tw & @zh_cn
      @zh_tw -= @common
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module UnihanLang # Top-level namespace of the unihan_lang gem.
4
+   VERSION = "0.1.0" # Release version string; unihan_lang.gemspec assigns it to spec.version.
5
+ end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "unihan_lang/version"
4
+ require_relative "unihan_lang/chinese_processor"
5
+
6
module UnihanLang
  # Text-level helpers that decide whether a string is Traditional or
  # Simplified Chinese by delegating per-character checks to ChineseProcessor.
  #
  # Every predicate returns false for an empty string: an empty text carries
  # no evidence for any language.
  class Unihan
    # Labels returned by #determine_language, keyed by #language_ratio result.
    # NOTE: :ja is kept for API compatibility, but nothing in this class
    # currently yields :ja, so "JA" is never returned.
    LANGUAGE_LABELS = { ja: "JA", tw: "ZH_TW", cn: "ZH_CN" }.freeze
    private_constant :LANGUAGE_LABELS

    # Builds the underlying ChineseProcessor (which loads its data files).
    def initialize
      @chinese_processor = ChineseProcessor.new
    end

    # @param text [String]
    # @return [Boolean] true if +text+ is classified as Traditional Chinese
    def zh_tw?(text)
      language_ratio(text) == :tw
    end

    # @param text [String]
    # @return [Boolean] true if +text+ is classified as Simplified Chinese
    def zh_cn?(text)
      language_ratio(text) == :cn
    end

    # @param text [String]
    # @return [Boolean] true if +text+ is non-empty and every character is
    #   traditional-only
    def only_zh_tw?(text)
      !text.empty? && text.each_char.all? { |char| @chinese_processor.only_zh_tw?(char) }
    end

    # @param text [String]
    # @return [Boolean] true if +text+ is non-empty and every character is
    #   simplified-only
    def only_zh_cn?(text)
      !text.empty? && text.each_char.all? { |char| @chinese_processor.only_zh_cn?(char) }
    end

    # @param text [String]
    # @return [Boolean] true if any character is traditional-only
    def contains_zh_tw?(text)
      text.each_char.any? { |char| @chinese_processor.only_zh_tw?(char) }
    end

    # @param text [String]
    # @return [Boolean] true if any character is simplified-only
    def contains_zh_cn?(text)
      text.each_char.any? { |char| @chinese_processor.only_zh_cn?(char) }
    end

    # @param text [String]
    # @return [Boolean] true if any character is recognised as Chinese
    def contains_chinese?(text)
      text.each_char.any? { |char| @chinese_processor.chinese_character?(char) }
    end

    # @param text [String]
    # @return [Array<String>] the Chinese characters of +text+, in order
    def extract_chinese_characters(text)
      text.each_char.select { |char| @chinese_processor.chinese_character?(char) }
    end

    # @param text [String]
    # @return [String] "ZH_TW", "ZH_CN" or "Unknown" ("JA" is reserved, see
    #   LANGUAGE_LABELS)
    def determine_language(text)
      LANGUAGE_LABELS.fetch(language_ratio(text), "Unknown")
    end

    private

    # Counts traditional-only and simplified-only characters and returns the
    # most likely language.
    #
    # @return [Symbol] :unknown if +text+ is empty or contains any character
    #   that is not Chinese; :tw if traditional-only characters outnumber
    #   simplified-only ones; :cn otherwise (ties included)
    def language_ratio(text)
      return :unknown if text.empty?

      tw_only = 0
      cn_only = 0
      text.each_char do |char|
        # A single non-Chinese character makes the whole text unclassifiable.
        return :unknown unless @chinese_processor.chinese?(char)

        tw_only += 1 if @chinese_processor.only_zh_tw?(char)
        cn_only += 1 if @chinese_processor.only_zh_cn?(char)
      end

      tw_only > cn_only ? :tw : :cn
    end
  end
end
data/test.rb ADDED
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+ # rubocop:disable all
3
+ $LOAD_PATH.unshift File.expand_path("lib", __dir__)
4
+ require "unihan_lang"
5
+
6
# Manual smoke script: prints the text-level verdicts and a per-character
# breakdown for a handful of Chinese / Japanese sample words.
unihan = UnihanLang::Unihan.new

# The processors are internal state of Unihan, so they are read directly.
# Look them up once instead of once per character.
chinese_processor = unihan.instance_variable_get(:@chinese_processor)
# nil when the loaded gem version has no Japanese processor (as in 0.1.0).
japanese_processor = unihan.instance_variable_get(:@japanese_processor)

test_cases = %w(
  繁體字
  简体字
  日本語
  中文
  漢字
  汉字
  東京
  北京
  台北
  ひらがな
  カタカナ
  漢字とひらがな
  こんにちは世界
  你好世界
  你好世界
  實際的例子
  实际的例子
  現実の例
)

test_cases.each do |word|
  puts "\nTesting '#{word}':"
  puts "zh_tw?: #{unihan.zh_tw?(word)}"
  puts "zh_cn?: #{unihan.zh_cn?(word)}"
  # Unihan#ja? does not exist in every version; skip it rather than crash.
  puts "ja?: #{unihan.ja?(word)}" if unihan.respond_to?(:ja?)
  puts "Language: #{unihan.determine_language(word)}"
  puts "Character details:"

  word.each_char do |char|
    labels = []
    labels << "ZH_TW" if chinese_processor.zh_tw.include?(char)
    labels << "ZH_CN" if chinese_processor.zh_cn.include?(char)
    labels << "Common" if chinese_processor.common.include?(char)
    labels << "Chinese" if chinese_processor.chinese?(char)
    labels << "Japanese" if japanese_processor&.japanese?(char)
    labels << "KANA" if char.match?(/[\p{Hiragana}\p{Katakana}ー]/)

    # A character that matched nothing at all is reported as UNKNOWN.
    puts "#{char}: #{labels.empty? ? "UNKNOWN" : labels.join(" ")}"
  end
end
58
+ # rubocop:enable all
File without changes
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ # rubocop:disable Layout/LineLength, Gemspec/RequiredRubyVersion
4
+ require_relative "lib/unihan_lang/version"
5
+
6
# Gem specification for unihan_lang.
Gem::Specification.new do |spec|
  spec.name = "unihan_lang"
  spec.version = UnihanLang::VERSION
  spec.authors = ["kyubey1228"]
  spec.email = ["kyuuka1228@gmail.com"]

  spec.summary = "Language detection for Chinese and Japanese characters"
  spec.description = "A gem to detect and differentiate between Traditional Chinese, Simplified Chinese, and Japanese characters based on Unihan data."
  spec.homepage = "https://github.com/kyubey1228/unihan_lang"
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")

  # Assign the file list first: spec.executables below is derived from it,
  # and would always be empty if computed before spec.files is set.
  # Both the git listing and the data/ glob are resolved inside the gemspec's
  # own directory so the result does not depend on the caller's working
  # directory; data files that are also tracked by git are de-duplicated.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{^(test|spec|features)/}) }
    (tracked + Dir["data/*"]).uniq
  end

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
30
+ # rubocop:enable all
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unihan_lang
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - kyubey1228
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-09-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: A gem to detect and differentiate between Traditional Chinese, Simplified
56
+ Chinese, and Japanese characters based on Unihan data.
57
+ email:
58
+ - kyuuka1228@gmail.com
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".github/workflows/main.yml"
64
+ - ".gitignore"
65
+ - ".rspec"
66
+ - ".rubocop.yml"
67
+ - Gemfile
68
+ - Gemfile.lock
69
+ - README.md
70
+ - Rakefile
71
+ - data/Unihan_Variants.txt
72
+ - data/traditional_chinese_list.txt
73
+ - lib/unihan_lang.rb
74
+ - lib/unihan_lang/chinese_processor.rb
75
+ - lib/unihan_lang/version.rb
76
+ - test.rb
77
+ - traditional_characters.txt
78
+ - unihan_lang.gemspec
79
+ homepage: https://github.com/kyubey1228/unihan_lang
80
+ licenses:
81
+ - MIT
82
+ metadata: {}
83
+ post_install_message:
84
+ rdoc_options: []
85
+ require_paths:
86
+ - lib
87
+ required_ruby_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: 2.5.0
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubygems_version: 3.5.11
99
+ signing_key:
100
+ specification_version: 4
101
+ summary: Language detection for Chinese and Japanese characters
102
+ test_files: []