unihan_lang 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/HOW_TO_UPDATE.md +32 -0
- data/lib/unihan_lang/chinese_score_analyzer.rb +56 -0
- data/lib/unihan_lang/variant_mapping.rb +46 -0
- data/lib/unihan_lang/version.rb +1 -1
- data/lib/unihan_lang.rb +19 -10
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 07200fff605b22cae82d18c8b9a16041fb1fbcd53c9106a25582785a779631ad
|
|
4
|
+
data.tar.gz: 335f78c7b507b547bbad354856bb294655f1042943ed66d3899a607721c40efb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7691fa9c50fc79921fa069e36a367cb37adc20a2be5fecdd4d768d037389b3fa434fa1078ab4bf05afc026f5a91d1b8215696f0eff0f2feb9987e10264a4bc32
|
|
7
|
+
data.tar.gz: acff2b9f98d296486e0a83f2b8909db4351042e7ec837c088d0ddf8377b99425ebda8eea4f99709861390fe141162f95717768196c7a022eb29d13030d9708f8
|
data/Gemfile.lock
CHANGED
data/HOW_TO_UPDATE.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Setup
|
|
2
|
+
|
|
3
|
+
```bash
|
|
4
|
+
mkdir ~/.gem
|
|
5
|
+
RUBY_GEMS_ACCOUNT=TODO
|
|
6
|
+
curl -u $RUBY_GEMS_ACCOUNT https://rubygems.org/api/v1/api_key.yaml > ~/.gem/credentials; chmod 0600 ~/.gem/credentials
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
# Publish
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Add release branch
|
|
13
|
+
VERSION=TODO
|
|
14
|
+
git checkout master
|
|
15
|
+
git checkout -b release/$VERSION
|
|
16
|
+
vi lib/unihan_lang/version.rb # edit version
|
|
17
|
+
git commit -am "Bump version $VERSION"
|
|
18
|
+
|
|
19
|
+
# Publish to rubygems
|
|
20
|
+
gem build unihan_lang.gemspec
|
|
21
|
+
bundle install
|
|
22
|
+
gem push unihan_lang-$VERSION.gem
|
|
23
|
+
|
|
24
|
+
# GitHub release
|
|
25
|
+
git tag $VERSION
|
|
26
|
+
git push --tags
|
|
27
|
+
open https://github.com/kyubey1228/unihan_lang/releases/new
|
|
28
|
+
|
|
29
|
+
# merge to master
|
|
30
|
+
git checkout master
|
|
31
|
+
git merge release/$VERSION
|
|
32
|
+
```
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module UnihanLang
  # Scores a piece of text for Traditional vs. Simplified Chinese usage.
  #
  # All scoring happens once, in the constructor. For every character that the
  # processor recognises as Chinese:
  #
  # * +2 goes to the traditional score when the processor reports the character
  #   as zh-TW only; otherwise +2 goes to the simplified score when it reports
  #   the character as zh-CN only;
  # * +0.5 goes to the traditional score when the variant mapping returns any
  #   traditional variants for the character;
  # * +0.5 goes to the simplified score when the variant mapping returns any
  #   simplified variants for the character.
  class ChineseScoreAnalyzer
    # Weight added when the processor says a character belongs to one script only.
    EXCLUSIVE_WEIGHT = 2
    # Weight added when the variant mapping knows a counterpart for a character.
    VARIANT_WEIGHT = 0.5

    attr_reader :traditional_score, :simplified_score, :total_chinese

    # @param text [String] the text to score
    # @param chinese_processor [#chinese_character?, #only_zh_tw?, #only_zh_cn?]
    #   predicate object consulted for every character
    # @param variant_mapping [#traditional_variants, #simplified_variants]
    #   lookup object; both methods must return a collection responding to +any?+
    def initialize(text, chinese_processor, variant_mapping)
      @text = text
      @chinese_processor = chinese_processor
      @variant_mapping = variant_mapping
      @traditional_score = 0
      @simplified_score = 0
      analyze
    end

    # Names the script with the strictly higher score.
    #
    # @return [String] "ZH_TW" or "ZH_CN"; "Unknown" when the text contains no
    #   Chinese characters or when both scores are equal
    def dominant_language
      return "Unknown" if total_chinese.zero?
      return "ZH_TW" if traditional_score > simplified_score
      return "ZH_CN" if simplified_score > traditional_score

      "Unknown"
    end

    # Classifies text that consists exclusively of Chinese characters.
    #
    # @return [Symbol] +:unknown+ when at least one character of the text is not
    #   Chinese; otherwise +:tw+ when the traditional score is strictly higher
    #   and +:cn+ in every other case (ties included)
    def language_ratio
      return :unknown if total_chinese != @text.length

      # A tie deliberately resolves to :cn. The original code had a trailing
      # `:unknown` after a `>=` comparison, which could never be reached.
      traditional_score > simplified_score ? :tw : :cn
    end

    private

    # Walks the text once, counting Chinese characters and scoring each of them.
    def analyze
      @total_chinese = 0
      @text.each_char do |char|
        next unless @chinese_processor.chinese_character?(char)

        @total_chinese += 1
        calculate_character_scores(char)
      end
    end

    # Applies the exclusive-script weight and the variant bonuses for one character.
    def calculate_character_scores(char)
      if @chinese_processor.only_zh_tw?(char)
        @traditional_score += EXCLUSIVE_WEIGHT
      elsif @chinese_processor.only_zh_cn?(char)
        @simplified_score += EXCLUSIVE_WEIGHT
      end

      # NOTE(review): a character that *has* traditional variants is presumably
      # itself a simplified form, yet it raises the traditional score here (and
      # vice versa below). Behaviour is kept as-is; confirm the intended
      # direction with the author.
      @traditional_score += VARIANT_WEIGHT if @variant_mapping.traditional_variants(char).any?
      @simplified_score += VARIANT_WEIGHT if @variant_mapping.simplified_variants(char).any?
    end
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set" # Set is only autoloaded from Ruby 3.2 onwards

module UnihanLang
  # Bidirectional lookup between Traditional and Simplified Chinese characters,
  # built from the kTraditionalVariant records of a Unihan variants file.
  #
  # Each record is a tab-separated line of the form
  #   U+<simplified>  kTraditionalVariant  U+<trad> [U+<trad> ...]
  # Lines starting with "#", blank lines and records for other fields are ignored.
  class VariantMapping
    # Location of the bundled data file, relative to this source file.
    DEFAULT_DATA_PATH = File.join(File.dirname(__FILE__), "..", "..", "data", "Unihan_Variants.txt").freeze
    # Name of the only Unihan field this class reads.
    TRADITIONAL_VARIANT_FIELD = "kTraditionalVariant"

    # @param data_path [String] path of the Unihan variants file to read;
    #   defaults to the file bundled next to the library
    # @raise [SystemCallError] when the file cannot be read
    def initialize(data_path = DEFAULT_DATA_PATH)
      @traditional_to_simplified = load_variant_mappings(data_path)
      # Build the reverse (simplified -> traditional) index from the forward one.
      @simplified_to_traditional = {}
      @traditional_to_simplified.each do |trad, simps|
        simps.each do |simp|
          (@simplified_to_traditional[simp] ||= Set.new) << trad
        end
      end
    end

    # @param char [String] a single character
    # @return [Set<String>] traditional forms recorded for +char+ (empty when none)
    def traditional_variants(char)
      @simplified_to_traditional[char] || Set.new
    end

    # @param char [String] a single character
    # @return [Set<String>] simplified forms recorded for +char+ (empty when none)
    def simplified_variants(char)
      @traditional_to_simplified[char] || Set.new
    end

    private

    # Reads the data file and returns a Hash mapping each traditional character
    # to the Set of simplified characters that list it as a traditional variant.
    def load_variant_mappings(data_path = DEFAULT_DATA_PATH)
      traditional_to_simplified = {}

      File.foreach(data_path, encoding: "UTF-8") do |line|
        next if line.start_with?("#")

        source, field, targets = line.strip.split("\t")
        # Only kTraditionalVariant records that actually carry a value are used.
        next unless field == TRADITIONAL_VARIANT_FIELD && targets

        simp = decode_code_point(source)
        # The value may hold several space-separated code points; keep all of
        # them rather than only the first.
        targets.split.each do |target|
          trad = decode_code_point(target)
          (traditional_to_simplified[trad] ||= Set.new) << simp
        end
      end

      traditional_to_simplified
    end

    # Converts a "U+XXXX" token into the corresponding UTF-8 character.
    def decode_code_point(token)
      [token.sub(/\AU\+/, "").hex].pack("U")
    end
  end
end
|
data/lib/unihan_lang/version.rb
CHANGED
data/lib/unihan_lang.rb
CHANGED
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "unihan_lang/version"
|
|
4
4
|
require_relative "unihan_lang/chinese_processor"
|
|
5
|
+
require_relative "unihan_lang/variant_mapping"
|
|
6
|
+
require_relative "unihan_lang/chinese_score_analyzer"
|
|
5
7
|
|
|
6
8
|
module UnihanLang
|
|
7
9
|
class Unihan
|
|
8
10
|
def initialize
|
|
9
11
|
@chinese_processor = ChineseProcessor.new
|
|
12
|
+
@variant_mapping = VariantMapping.new
|
|
10
13
|
end
|
|
11
14
|
|
|
12
15
|
def zh_tw?(text)
|
|
@@ -49,19 +52,25 @@ module UnihanLang
|
|
|
49
52
|
end
|
|
50
53
|
end
|
|
51
54
|
|
|
52
|
-
|
|
55
|
+
# Runs the variant-aware scorer over +text+ and exposes its raw counters.
#
# @param text [String] the text to score
# @return [Hash{Symbol => Numeric}] the analyzer's +:traditional_score+,
#   +:simplified_score+ and +:total_chinese+ values, in that order
def analyze_with_variants(text)
  scores = ChineseScoreAnalyzer.new(text, @chinese_processor, @variant_mapping)

  %i[traditional_score simplified_score total_chinese].each_with_object({}) do |metric, report|
    report[metric] = scores.public_send(metric)
  end
end
|
|
53
63
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
chinese_chars = text.chars.count { |char| @chinese_processor.chinese?(char) }
|
|
64
|
+
# Reports which script dominates +text+ according to the variant-aware scorer.
#
# @param text [String] the text to classify
# @return [String] whatever ChineseScoreAnalyzer#dominant_language yields for +text+
def determine_language_with_variants(text)
  ChineseScoreAnalyzer
    .new(text, @chinese_processor, @variant_mapping)
    .dominant_language
end
|
|
59
68
|
|
|
60
|
-
|
|
61
|
-
return :tw if only_tw_chars > only_cn_chars
|
|
62
|
-
return :cn if only_cn_chars >= only_tw_chars
|
|
69
|
+
private
|
|
63
70
|
|
|
64
|
-
|
|
71
|
+
# Delegates the classification of +text+ to the variant-aware scorer.
#
# @param text [String] the text to classify
# @return [Symbol] whatever ChineseScoreAnalyzer#language_ratio yields for +text+
def language_ratio(text)
  ChineseScoreAnalyzer
    .new(text, @chinese_processor, @variant_mapping)
    .language_ratio
end
|
|
66
75
|
end
|
|
67
76
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unihan_lang
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- kyubey1228
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 2025-02-20 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: bundler
|
|
@@ -66,6 +65,7 @@ files:
|
|
|
66
65
|
- ".rubocop.yml"
|
|
67
66
|
- Gemfile
|
|
68
67
|
- Gemfile.lock
|
|
68
|
+
- HOW_TO_UPDATE.md
|
|
69
69
|
- LICENSE.md
|
|
70
70
|
- README.ja.md
|
|
71
71
|
- README.md
|
|
@@ -73,13 +73,14 @@ files:
|
|
|
73
73
|
- data/Unihan_Variants.txt
|
|
74
74
|
- lib/unihan_lang.rb
|
|
75
75
|
- lib/unihan_lang/chinese_processor.rb
|
|
76
|
+
- lib/unihan_lang/chinese_score_analyzer.rb
|
|
77
|
+
- lib/unihan_lang/variant_mapping.rb
|
|
76
78
|
- lib/unihan_lang/version.rb
|
|
77
79
|
- unihan_lang.gemspec
|
|
78
80
|
homepage: https://github.com/kyubey1228/unihan_lang
|
|
79
81
|
licenses:
|
|
80
82
|
- MIT
|
|
81
83
|
metadata: {}
|
|
82
|
-
post_install_message:
|
|
83
84
|
rdoc_options: []
|
|
84
85
|
require_paths:
|
|
85
86
|
- lib
|
|
@@ -94,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
94
95
|
- !ruby/object:Gem::Version
|
|
95
96
|
version: '0'
|
|
96
97
|
requirements: []
|
|
97
|
-
rubygems_version: 3.
|
|
98
|
-
signing_key:
|
|
98
|
+
rubygems_version: 3.6.2
|
|
99
99
|
specification_version: 4
|
|
100
100
|
summary: Language detection for Chinese characters
|
|
101
101
|
test_files: []
|