unihan_lang 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 81d1394f3bee01c607c5440f3682b6401c9aa1d1ff9ba5ad286c196f8eebc54b
4
- data.tar.gz: 8be0a8218adbe226b8f2079a893200912e8cbcba6b2c00aa405b0b886dd01f50
3
+ metadata.gz: 07200fff605b22cae82d18c8b9a16041fb1fbcd53c9106a25582785a779631ad
4
+ data.tar.gz: 335f78c7b507b547bbad354856bb294655f1042943ed66d3899a607721c40efb
5
5
  SHA512:
6
- metadata.gz: 01ae746510cad08ab38db9f21752049e7fd99141fdc6886f6a93a2c521feb0554b47fafa8d7d3bfa4fd6aee560bbd1dfa3a08ad0ac477e399583971fa7258ebe
7
- data.tar.gz: 302c234bc6616021682ff2b9f86df610f24bc9bd7894e6107369d44a5e2e4164738130ec31d785f45485b5e508fc03934cd50bb5bea52d9423299ca5679e182c
6
+ metadata.gz: 7691fa9c50fc79921fa069e36a367cb37adc20a2be5fecdd4d768d037389b3fa434fa1078ab4bf05afc026f5a91d1b8215696f0eff0f2feb9987e10264a4bc32
7
+ data.tar.gz: acff2b9f98d296486e0a83f2b8909db4351042e7ec837c088d0ddf8377b99425ebda8eea4f99709861390fe141162f95717768196c7a022eb29d13030d9708f8
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unihan_lang (0.1.0)
4
+ unihan_lang (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/HOW_TO_UPDATE.md ADDED
@@ -0,0 +1,32 @@
1
+ # Setup
2
+
3
+ ```bash
4
+ mkdir -p ~/.gem
5
+ RUBY_GEMS_ACCOUNT=TODO
6
+ curl -u $RUBY_GEMS_ACCOUNT https://rubygems.org/api/v1/api_key.yaml > ~/.gem/credentials; chmod 0600 ~/.gem/credentials
7
+ ```
8
+
9
+ # Publish
10
+
11
+ ```bash
12
+ # Add release branch
13
+ VERSION=TODO
14
+ git checkout master
15
+ git checkout -b release/$VERSION
16
+ vi lib/unihan_lang/version.rb # edit version
17
+ git commit -am "Bump version $VERSION"
18
+
19
+ # Publish to rubygems
20
+ gem build unihan_lang.gemspec
21
+ bundle install
22
+ gem push unihan_lang-$VERSION.gem
23
+
24
+ # GitHub release
25
+ git tag $VERSION
26
+ git push --tags
27
+ open https://github.com/kyubey1228/unihan_lang/releases/new
28
+
29
+ # Merge to master
30
+ git checkout master
31
+ git merge release/$VERSION
32
+ ```
@@ -0,0 +1,56 @@
# frozen_string_literal: true

module UnihanLang
  # Scores a text as traditional vs. simplified Chinese, character by character.
  #
  # Collaborators are duck-typed:
  # * +chinese_processor+ must respond to +chinese_character?+, +only_zh_tw?+
  #   and +only_zh_cn?+, each taking a single character.
  # * +variant_mapping+ must respond to +traditional_variants+ and
  #   +simplified_variants+, each taking a character and returning a
  #   collection that responds to +any?+.
  class ChineseScoreAnalyzer
    # Score added for a character the processor reports as exclusive to one script.
    EXCLUSIVE_WEIGHT = 2
    # Score added when the variant mapping returns a non-empty result for the character.
    VARIANT_WEIGHT = 0.5

    attr_reader :traditional_score, :simplified_score, :total_chinese

    # Analyzes +text+ immediately; the results are exposed through the readers.
    #
    # @param text [#to_s] text to analyze; +nil+ is treated as empty text
    # @param chinese_processor [#chinese_character?, #only_zh_tw?, #only_zh_cn?]
    # @param variant_mapping [#traditional_variants, #simplified_variants]
    def initialize(text, chinese_processor, variant_mapping)
      # to_s keeps a nil argument from raising NoMethodError during analysis.
      @text = text.to_s
      @chinese_processor = chinese_processor
      @variant_mapping = variant_mapping
      @traditional_score = 0
      @simplified_score = 0
      @total_chinese = 0
      analyze
    end

    # Picks the script with the strictly higher score.
    #
    # @return [String] "ZH_TW", "ZH_CN", or "Unknown" when the text contains
    #   no Chinese characters or the two scores are tied
    def dominant_language
      return "Unknown" if total_chinese.zero?
      return "ZH_TW" if traditional_score > simplified_score
      return "ZH_CN" if simplified_score > traditional_score

      "Unknown"
    end

    # Classifies text that consists solely of Chinese characters.
    #
    # @return [Symbol] :unknown when the text is empty or contains any
    #   non-Chinese character; otherwise :tw when the traditional score is
    #   strictly higher, and :cn in every other case (ties included)
    def language_ratio
      # Empty text has nothing to classify; this mirrors dominant_language.
      return :unknown if total_chinese.zero? || total_chinese != @text.length

      traditional_score > simplified_score ? :tw : :cn
    end

    private

    # Counts the Chinese characters in the text and accumulates both scores.
    def analyze
      @text.each_char do |char|
        next unless @chinese_processor.chinese_character?(char)

        @total_chinese += 1
        calculate_character_scores(char)
      end
    end

    # Adds the exclusive-script weight (at most one side) and then the variant
    # weights (possibly both sides) for a single Chinese character.
    def calculate_character_scores(char)
      if @chinese_processor.only_zh_tw?(char)
        @traditional_score += EXCLUSIVE_WEIGHT
      elsif @chinese_processor.only_zh_cn?(char)
        @simplified_score += EXCLUSIVE_WEIGHT
      end

      # NOTE(review): a character that *has* traditional variants is usually a
      # simplified form itself, so crediting traditional_score here looks
      # inverted (and likewise below) - confirm the intended weighting before
      # changing it; the behavior is kept as released.
      @traditional_score += VARIANT_WEIGHT if @variant_mapping.traditional_variants(char).any?
      @simplified_score += VARIANT_WEIGHT if @variant_mapping.simplified_variants(char).any?
    end
  end
end
@@ -0,0 +1,46 @@
# frozen_string_literal: true

# Set is only autoloaded from Ruby 3.2 on; require it explicitly for older rubies.
require "set"

module UnihanLang
  # Two-way lookup between simplified and traditional Chinese characters,
  # built from the kTraditionalVariant records of a Unihan variants file.
  class VariantMapping
    # Variants file shipped with the gem (<gem root>/data/Unihan_Variants.txt).
    DEFAULT_DATA_PATH = File.expand_path("../../data/Unihan_Variants.txt", __dir__)
    # The only record type read from the file.
    TRADITIONAL_VARIANT_FIELD = "kTraditionalVariant"
    # Captures the hexadecimal digits of a code point token such as "U+5E72".
    CODE_POINT_PATTERN = /\A(?:U\+)?(\h+)/

    # @param file_path [String] tab-separated Unihan variants file to load;
    #   defaults to the file bundled with the gem
    # @raise [SystemCallError] if the file cannot be read
    def initialize(file_path = DEFAULT_DATA_PATH)
      @traditional_to_simplified = load_variant_mappings(file_path)
      # Build the simplified -> traditional mapping by inverting the loaded one.
      @simplified_to_traditional = invert(@traditional_to_simplified)
    end

    # @param char [String] a single character
    # @return [Set<String>] traditional forms recorded for +char+ (empty if none)
    def traditional_variants(char)
      @simplified_to_traditional[char] || Set.new
    end

    # @param char [String] a single character
    # @return [Set<String>] simplified forms recorded for +char+ (empty if none)
    def simplified_variants(char)
      @traditional_to_simplified[char] || Set.new
    end

    private

    # Reads the variants file and returns { traditional char => Set of simplified chars }.
    def load_variant_mappings(file_path)
      mapping = {}

      File.foreach(file_path, encoding: "UTF-8") do |line|
        next if line.start_with?("#")

        source, field, targets = line.strip.split("\t")
        # Only process kTraditionalVariant records; blank lines yield nil fields.
        next unless field == TRADITIONAL_VARIANT_FIELD && targets

        simp = char_for(source)
        next unless simp

        # The value may list several space-separated code points; keep them all.
        targets.split.each do |token|
          trad = char_for(token)
          (mapping[trad] ||= Set.new) << simp if trad
        end
      end

      mapping
    end

    # Returns { simplified char => Set of traditional chars } for the given mapping.
    def invert(mapping)
      mapping.each_with_object({}) do |(trad, simps), inverted|
        simps.each { |simp| (inverted[simp] ||= Set.new) << trad }
      end
    end

    # Converts a "U+XXXX" token to its character, or nil when the token is
    # malformed (instead of silently producing U+0000).
    def char_for(token)
      digits = token[CODE_POINT_PATTERN, 1]
      [digits.hex].pack("U") if digits
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module UnihanLang
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
data/lib/unihan_lang.rb CHANGED
@@ -2,11 +2,14 @@
2
2
 
3
3
  require_relative "unihan_lang/version"
4
4
  require_relative "unihan_lang/chinese_processor"
5
+ require_relative "unihan_lang/variant_mapping"
6
+ require_relative "unihan_lang/chinese_score_analyzer"
5
7
 
6
8
  module UnihanLang
7
9
  class Unihan
8
10
  def initialize
9
11
  @chinese_processor = ChineseProcessor.new
12
+ @variant_mapping = VariantMapping.new
10
13
  end
11
14
 
12
15
  def zh_tw?(text)
@@ -49,19 +52,25 @@ module UnihanLang
49
52
  end
50
53
  end
51
54
 
52
- private
55
+ def analyze_with_variants(text)
56
+ analyzer = ChineseScoreAnalyzer.new(text, @chinese_processor, @variant_mapping)
57
+ {
58
+ traditional_score: analyzer.traditional_score,
59
+ simplified_score: analyzer.simplified_score,
60
+ total_chinese: analyzer.total_chinese,
61
+ }
62
+ end
53
63
 
54
- # テキストの言語比率を計算し、最も可能性の高い言語を返す
55
- def language_ratio(text)
56
- only_tw_chars = text.chars.count { |char| @chinese_processor.only_zh_tw?(char) }
57
- only_cn_chars = text.chars.count { |char| @chinese_processor.only_zh_cn?(char) }
58
- chinese_chars = text.chars.count { |char| @chinese_processor.chinese?(char) }
64
+ def determine_language_with_variants(text)
65
+ analyzer = ChineseScoreAnalyzer.new(text, @chinese_processor, @variant_mapping)
66
+ analyzer.dominant_language
67
+ end
59
68
 
60
- return :unknown unless chinese_chars == text.length
61
- return :tw if only_tw_chars > only_cn_chars
62
- return :cn if only_cn_chars >= only_tw_chars
69
+ private
63
70
 
64
- :unknown
71
+ def language_ratio(text)
72
+ analyzer = ChineseScoreAnalyzer.new(text, @chinese_processor, @variant_mapping)
73
+ analyzer.language_ratio
65
74
  end
66
75
  end
67
76
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unihan_lang
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - kyubey1228
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-10-17 00:00:00.000000000 Z
10
+ date: 2025-02-20 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: bundler
@@ -66,6 +65,7 @@ files:
66
65
  - ".rubocop.yml"
67
66
  - Gemfile
68
67
  - Gemfile.lock
68
+ - HOW_TO_UPDATE.md
69
69
  - LICENSE.md
70
70
  - README.ja.md
71
71
  - README.md
@@ -73,13 +73,14 @@ files:
73
73
  - data/Unihan_Variants.txt
74
74
  - lib/unihan_lang.rb
75
75
  - lib/unihan_lang/chinese_processor.rb
76
+ - lib/unihan_lang/chinese_score_analyzer.rb
77
+ - lib/unihan_lang/variant_mapping.rb
76
78
  - lib/unihan_lang/version.rb
77
79
  - unihan_lang.gemspec
78
80
  homepage: https://github.com/kyubey1228/unihan_lang
79
81
  licenses:
80
82
  - MIT
81
83
  metadata: {}
82
- post_install_message:
83
84
  rdoc_options: []
84
85
  require_paths:
85
86
  - lib
@@ -94,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
95
  - !ruby/object:Gem::Version
95
96
  version: '0'
96
97
  requirements: []
97
- rubygems_version: 3.5.3
98
- signing_key:
98
+ rubygems_version: 3.6.2
99
99
  specification_version: 4
100
100
  summary: Language detection for Chinese characters
101
101
  test_files: []