RubyGems - unihan_lang - Versions diffs - 0.1.0 - Mend

unihan_lang 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +7 -0
data/.github/workflows/main.yml +30 -0
data/.gitignore +10 -0
data/.rspec +2 -0
data/.rubocop.yml +102 -0
data/Gemfile +8 -0
data/Gemfile.lock +68 -0
data/README.md +57 -0
data/Rakefile +12 -0
data/data/Unihan_Variants.txt +17429 -0
data/data/traditional_chinese_list.txt +6017 -0
data/lib/unihan_lang/chinese_processor.rb +88 -0
data/lib/unihan_lang/version.rb +5 -0
data/lib/unihan_lang.rb +68 -0
data/test.rb +58 -0
data/traditional_characters.txt +0 -0
data/unihan_lang.gemspec +30 -0
metadata +102 -0

data/lib/unihan_lang/chinese_processor.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+module UnihanLang
+  class ChineseProcessor
+    attr_reader :zh_tw, :zh_cn, :common
+    def initialize
+      @zh_tw = Set.new
+      @zh_cn = Set.new
+      @common = Set.new
+      load_chinese_characters
+    end
+    def zh_tw?(char)
+      @zh_tw.include?(char) || @common.include?(char)
+    end
+    def zh_cn?(char)
+      @zh_cn.include?(char) || @common.include?(char)
+    end
+    def only_zh_tw?(char)
+      @zh_tw.include?(char) && !@common.include?(char)
+    end
+    def only_zh_cn?(char)
+      @zh_cn.include?(char)
+    end
+    def chinese?(char)
+      zh_tw?(char) || zh_cn?(char) || cjk?(char)
+    end
+    def chinese_character?(char)
+      chinese?(char)
+    end
+    private
+    def cjk?(char)
+      char.ord >= 0x4E00 && char.ord <= 0x9FFF
+    end
+    def load_chinese_characters
+      load_unihan_variants
+      load_traditional_chinese_list
+      process_character_sets
+    end
+    def load_unihan_variants
+      file_path = File.join(File.dirname(__FILE__), "..", "..", "data", "Unihan_Variants.txt")
+      File.foreach(file_path, encoding: "UTF-8") do |line|
+        next if line.start_with?("#") || line.strip.empty?
+        fields = line.strip.split("\t")
+        process_unihan_fields(fields) if fields.size >= 3
+      end
+    end
+    def process_unihan_fields(fields)
+      char = [fields[0].gsub(/^U\+/, "").hex].pack("U")
+      # Remove dictionary name.
+      # Example: U+348B kSemanticVariant U+5EDD<kMatthews U+53AE<kMatthews
+      variant = [fields[2].split("<")[0].gsub(/^U\+/, "").hex].pack("U")
+      case fields[1]
+      when "kTraditionalVariant"
+        @zh_tw << variant
+        @zh_cn << char
+      when "kSimplifiedVariant"
+        @zh_cn << variant
+        @zh_tw << char
+      end
+    end
+    def load_traditional_chinese_list
+      file_path = File.join(File.dirname(__FILE__), "..", "..", "data",
+                            "traditional_chinese_list.txt")
+      File.foreach(file_path, encoding: "UTF-8") { |line| @zh_tw << line.strip }
+    end
+    def process_character_sets
+      @common = @zh_tw & @zh_cn
+      @zh_tw -= @zh_cn
+      @zh_cn -= @zh_tw
+      @zh_cn |= @common
+    end
+  end
+end

data/lib/unihan_lang/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module UnihanLang
+  # Release version of the gem; unihan_lang.gemspec reads it for spec.version.
+  VERSION = "0.1.0"
+end

data/lib/unihan_lang.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+require_relative "unihan_lang/version"
+require_relative "unihan_lang/chinese_processor"
+module UnihanLang
+  class Unihan
+    def initialize
+      @chinese_processor = ChineseProcessor.new
+    end
+    def zh_tw?(text)
+      language_ratio(text) == :tw
+    end
+    def zh_cn?(text)
+      language_ratio(text) == :cn
+    end
+    def only_zh_tw?(text)
+      text.chars.all? { |char| @chinese_processor.only_zh_tw?(char) }
+    end
+    def only_zh_cn?(text)
+      text.chars.all? { |char| @chinese_processor.only_zh_cn?(char) }
+    end
+    def contains_zh_tw?(text)
+      text.chars.any? { |char| @chinese_processor.only_zh_tw?(char) }
+    end
+    def contains_zh_cn?(text)
+      text.chars.any? { |char| @chinese_processor.only_zh_cn?(char) }
+    end
+    def contains_chinese?(text)
+      text.chars.any? { |char| @chinese_processor.chinese_character?(char) }
+    end
+    def extract_chinese_characters(text)
+      text.chars.select { |char| @chinese_processor.chinese_character?(char) }
+    end
+    def determine_language(text)
+      case language_ratio(text)
+      when :ja then "JA"
+      when :tw then "ZH_TW"
+      when :cn then "ZH_CN"
+      else "Unknown"
+      end
+    end
+    private
+    # テキストの言語比率を計算し、最も可能性の高い言語を返す
+    def language_ratio(text)
+      only_tw_chars = text.chars.count { |char| @chinese_processor.only_zh_tw?(char) }
+      only_cn_chars = text.chars.count { |char| @chinese_processor.only_zh_cn?(char) }
+      chinese_chars = text.chars.count { |char| @chinese_processor.chinese?(char) }
+      return :unknown unless chinese_chars == text.length
+      return :tw if only_tw_chars > only_cn_chars
+      return :cn if only_cn_chars >= only_tw_chars
+      :unknown
+    end
+  end
+end

data/test.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+# rubocop:disable all
+$LOAD_PATH.unshift File.expand_path("lib", __dir__)
+require "unihan_lang"
+unihan = UnihanLang::Unihan.new
+test_cases = %w(
+  繁體字
+  简体字
+  日本語
+  中文
+  漢字
+  汉字
+  東京
+  北京
+  台北
+  ひらがな
+  カタカナ
+  漢字とひらがな
+  こんにちは世界
+  你好世界
+  你好世界
+  實際的例子
+  实际的例子
+  現実の例
+)
+test_cases.each do |word|
+  puts "\nTesting '#{word}':"
+  puts "zh_tw?: #{unihan.zh_tw?(word)}"
+  puts "zh_cn?: #{unihan.zh_cn?(word)}"
+  puts "ja?: #{unihan.ja?(word)}"
+  puts "Language: #{unihan.determine_language(word)}"
+  puts "Character details:"
+  word.each_char do |char|
+    print "#{char}: "
+    chinese_processor = unihan.instance_variable_get(:@chinese_processor)
+    japanese_processor = unihan.instance_variable_get(:@japanese_processor)
+    in_zh_tw = chinese_processor.zh_tw.include?(char)
+    in_zh_cn = chinese_processor.zh_cn.include?(char)
+    in_common = chinese_processor.common.include?(char)
+    is_chinese = chinese_processor.chinese?(char)
+    is_japanese = japanese_processor.japanese?(char.to_s)
+    is_kana = char =~ /[\p{Hiragana}\p{Katakana}ー]/
+    print "ZH_TW " if in_zh_tw
+    print "ZH_CN " if in_zh_cn
+    print "Common " if in_common
+    print "Chinese " if is_chinese
+    print "Japanese " if is_japanese
+    print "KANA " if is_kana
+    if !in_zh_tw && !in_zh_cn && !in_common && !is_chinese && !is_japanese && !is_kana
+      print "UNKNOWN"
+    end
+    puts
+  end
+end
+# rubocop:enable all

data/traditional_characters.txt ADDED Viewed

File without changes

data/unihan_lang.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+# rubocop:disable Layout/LineLength, Gemspec/RequiredRubyVersion
+require_relative "lib/unihan_lang/version"
+Gem::Specification.new do |spec|
+  spec.name          = "unihan_lang"
+  spec.version       = UnihanLang::VERSION
+  spec.authors       = ["kyubey1228"]
+  spec.email         = ["kyuuka1228@gmail.com"]
+  spec.summary       = "Language detection for Chinese and Japanese characters"
+  spec.description   = "A gem to detect and differentiate between Traditional Chinese, Simplified Chinese, and Japanese characters based on Unihan data."
+  spec.homepage      = "https://github.com/kyubey1228/unihan_lang"
+  spec.license       = "MIT"
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 2.0"
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.files = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end + Dir["data/*"]
+end
+# rubocop:enable all

metadata ADDED Viewed

@@ -0,0 +1,102 @@
+--- !ruby/object:Gem::Specification
+name: unihan_lang
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- kyubey1228
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2024-09-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description: A gem to detect and differentiate between Traditional Chinese, Simplified
+  Chinese, and Japanese characters based on Unihan data.
+email:
+- kyuuka1228@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".github/workflows/main.yml"
+- ".gitignore"
+- ".rspec"
+- ".rubocop.yml"
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- data/Unihan_Variants.txt
+- data/traditional_chinese_list.txt
+- lib/unihan_lang.rb
+- lib/unihan_lang/chinese_processor.rb
+- lib/unihan_lang/version.rb
+- test.rb
+- traditional_characters.txt
+- unihan_lang.gemspec
+homepage: https://github.com/kyubey1228/unihan_lang
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.5.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.11
+signing_key:
+specification_version: 4
+summary: Language detection for Chinese and Japanese characters
+test_files: []