unihan_lang 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.md +24 -0
- data/README.ja.md +85 -0
- data/README.md +54 -23
- data/lib/unihan_lang/chinese_processor.rb +12 -16
- data/lib/unihan_lang/version.rb +1 -1
- data/lib/unihan_lang.rb +0 -1
- data/unihan_lang.gemspec +2 -2
- metadata +7 -8
- data/data/traditional_chinese_list.txt +0 -6017
- data/test.rb +0 -58
- data/traditional_characters.txt +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 81d1394f3bee01c607c5440f3682b6401c9aa1d1ff9ba5ad286c196f8eebc54b
|
|
4
|
+
data.tar.gz: 8be0a8218adbe226b8f2079a893200912e8cbcba6b2c00aa405b0b886dd01f50
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 01ae746510cad08ab38db9f21752049e7fd99141fdc6886f6a93a2c521feb0554b47fafa8d7d3bfa4fd6aee560bbd1dfa3a08ad0ac477e399583971fa7258ebe
|
|
7
|
+
data.tar.gz: 302c234bc6616021682ff2b9f86df610f24bc9bd7894e6107369d44a5e2e4164738130ec31d785f45485b5e508fc03934cd50bb5bea52d9423299ca5679e182c
|
data/LICENSE.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
<!-- @format -->
|
|
2
|
+
|
|
3
|
+
# The MIT License (MIT)
|
|
4
|
+
|
|
5
|
+
Copyright 2024 kyubey1228
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
10
|
+
this software and associated documentation files (the “Software”), to deal in
|
|
11
|
+
the Software without restriction, including without limitation the rights to
|
|
12
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
13
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
14
|
+
subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
21
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
22
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
23
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
24
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.ja.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
<!-- @format -->
|
|
2
|
+
|
|
3
|
+
# UnihanLang
|
|
4
|
+
|
|
5
|
+
`unihan_lang` は、テキストの言語(繁体字中国語、簡体字中国語)を識別し、中国語の文字に関する様々な判定を行うための Ruby ライブラリです。
|
|
6
|
+
|
|
7
|
+
## インストール
|
|
8
|
+
|
|
9
|
+
Gemfile に以下の行を追加してください:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem 'unihan_lang'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
そして、以下のコマンドを実行してください:
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
bundle install
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
または、直接インストールする場合は以下のコマンドを使用してください:
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
gem install unihan_lang
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## 使用方法
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
require 'unihan_lang'
|
|
31
|
+
|
|
32
|
+
unihan = UnihanLang::Unihan.new
|
|
33
|
+
|
|
34
|
+
# 言語の判定
|
|
35
|
+
puts unihan.determine_language("這是繁體中文") # => "ZH_TW"
|
|
36
|
+
puts unihan.determine_language("这是简体中文") # => "ZH_CN"
|
|
37
|
+
|
|
38
|
+
# 繁体字中国語かどうかの判定
|
|
39
|
+
puts unihan.zh_tw?("這是繁體中文") # => true
|
|
40
|
+
puts unihan.zh_tw?("这不是繁体中文") # => false
|
|
41
|
+
|
|
42
|
+
# 簡体字中国語かどうかの判定
|
|
43
|
+
puts unihan.zh_cn?("这是简体中文") # => true
|
|
44
|
+
puts unihan.zh_cn?("這不是簡體中文") # => false
|
|
45
|
+
|
|
46
|
+
# テキストに中国語の文字が含まれているかの判定
|
|
47
|
+
puts unihan.contains_chinese?("This text contains 中文") # => true
|
|
48
|
+
puts unihan.contains_chinese?("This text has no Chinese") # => false
|
|
49
|
+
|
|
50
|
+
# テキストから中国語の文字を抽出
|
|
51
|
+
puts unihan.extract_chinese_characters("This text contains 中文").join # => "中文"
|
|
52
|
+
|
|
53
|
+
# 繁体字のみで構成されているかの判定
|
|
54
|
+
puts unihan.only_zh_tw?("繁體") # => true
|
|
55
|
+
puts unihan.only_zh_tw?("繁體简体") # => false
|
|
56
|
+
|
|
57
|
+
# 簡体字のみで構成されているかの判定
|
|
58
|
+
puts unihan.only_zh_cn?("简体") # => true
|
|
59
|
+
puts unihan.only_zh_cn?("简体繁體") # => false
|
|
60
|
+
|
|
61
|
+
# 繁体字を含むかどうかの判定
|
|
62
|
+
puts unihan.contains_zh_tw?("這個text包含繁體字") # => true
|
|
63
|
+
puts unihan.contains_zh_tw?("这个text不包含繁体字") # => false
|
|
64
|
+
|
|
65
|
+
# 簡体字を含むかどうかの判定
|
|
66
|
+
puts unihan.contains_zh_cn?("这个text包含简体字") # => true
|
|
67
|
+
puts unihan.contains_zh_cn?("這個text不包含簡體字") # => false
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## 機能説明
|
|
71
|
+
|
|
72
|
+
- `determine_language(text)`: テキストの言語を判定します("ZH_TW", "ZH_CN", "Unknown")。
|
|
73
|
+
- `zh_tw?(text)`: テキストが繁体字中国語かどうかを判定します。
|
|
74
|
+
- `zh_cn?(text)`: テキストが簡体字中国語かどうかを判定します。
|
|
75
|
+
- `contains_chinese?(text)`: テキストに中国語の文字が含まれているかを判定します。
|
|
76
|
+
- `extract_chinese_characters(text)`: テキストから中国語の文字を抽出します。
|
|
77
|
+
- `only_zh_tw?(text)`: テキストが繁体字のみで構成されているかを判定します。
|
|
78
|
+
- `only_zh_cn?(text)`: テキストが簡体字のみで構成されているかを判定します。
|
|
79
|
+
- `contains_zh_tw?(text)`: テキストに繁体字が含まれているかを判定します。
|
|
80
|
+
- `contains_zh_cn?(text)`: テキストに簡体字が含まれているかを判定します。
|
|
81
|
+
|
|
82
|
+
## 注意事項
|
|
83
|
+
|
|
84
|
+
このライブラリは、テキストの言語を完全に正確に判定することを保証するものではありません。
|
|
85
|
+
特に、短いテキストや複数の言語が混在するテキストの場合、判定が難しい場合があります。
|
data/README.md
CHANGED
|
@@ -2,56 +2,87 @@
|
|
|
2
2
|
|
|
3
3
|
# UnihanLang
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
`unihan_lang` is a Ruby library for identifying text language (Traditional Chinese, Simplified Chinese) and performing various checks on Chinese characters.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
This document can also be read in [Japanese](https://github.com/kyubey1228/unihan_lang/blob/master/README.ja.md).
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Add this line to your application's Gemfile:
|
|
10
12
|
|
|
11
13
|
```ruby
|
|
12
14
|
gem 'unihan_lang'
|
|
13
15
|
```
|
|
14
16
|
|
|
15
|
-
|
|
17
|
+
And then execute:
|
|
16
18
|
|
|
17
19
|
```sh
|
|
18
20
|
bundle install
|
|
19
21
|
```
|
|
20
22
|
|
|
21
|
-
|
|
23
|
+
Or install it yourself as:
|
|
22
24
|
|
|
23
25
|
```sh
|
|
24
26
|
gem install unihan_lang
|
|
25
27
|
```
|
|
26
28
|
|
|
27
|
-
##
|
|
29
|
+
## Usage
|
|
28
30
|
|
|
29
31
|
```ruby
|
|
30
32
|
require 'unihan_lang'
|
|
31
33
|
|
|
32
34
|
unihan = UnihanLang::Unihan.new
|
|
33
35
|
|
|
34
|
-
#
|
|
35
|
-
puts unihan.determine_language("這是繁體中文")
|
|
36
|
-
puts unihan.determine_language("这是简体中文")
|
|
36
|
+
# Language determination
|
|
37
|
+
puts unihan.determine_language("這是繁體中文") # => "ZH_TW"
|
|
38
|
+
puts unihan.determine_language("这是简体中文") # => "ZH_CN"
|
|
39
|
+
|
|
40
|
+
# Check if text is Traditional Chinese
|
|
41
|
+
puts unihan.zh_tw?("這是繁體中文") # => true
|
|
42
|
+
puts unihan.zh_tw?("这不是繁体中文") # => false
|
|
43
|
+
|
|
44
|
+
# Check if text is Simplified Chinese
|
|
45
|
+
puts unihan.zh_cn?("这是简体中文") # => true
|
|
46
|
+
puts unihan.zh_cn?("這不是簡體中文") # => false
|
|
47
|
+
|
|
48
|
+
# Check if text contains Chinese characters
|
|
49
|
+
puts unihan.contains_chinese?("This text contains 中文") # => true
|
|
50
|
+
puts unihan.contains_chinese?("This text has no Chinese") # => false
|
|
37
51
|
|
|
38
|
-
#
|
|
39
|
-
puts unihan.
|
|
40
|
-
puts unihan.zh_tw?("这不是繁体中文") # => false
|
|
52
|
+
# Extract Chinese characters from text
|
|
53
|
+
puts unihan.extract_chinese_characters("This text contains 中文").join # => "中文"
|
|
41
54
|
|
|
42
|
-
#
|
|
43
|
-
puts unihan.
|
|
44
|
-
puts unihan.
|
|
55
|
+
# Check if text consists only of Traditional Chinese characters
|
|
56
|
+
puts unihan.only_zh_tw?("繁體") # => true
|
|
57
|
+
puts unihan.only_zh_tw?("繁體简体") # => false
|
|
45
58
|
|
|
46
|
-
#
|
|
47
|
-
puts unihan.
|
|
48
|
-
puts unihan.
|
|
59
|
+
# Check if text consists only of Simplified Chinese characters
|
|
60
|
+
puts unihan.only_zh_cn?("简体") # => true
|
|
61
|
+
puts unihan.only_zh_cn?("简体繁體") # => false
|
|
49
62
|
|
|
50
|
-
#
|
|
51
|
-
puts unihan.
|
|
63
|
+
# Check if text contains Traditional Chinese characters
|
|
64
|
+
puts unihan.contains_zh_tw?("這個text包含繁體字") # => true
|
|
65
|
+
puts unihan.contains_zh_tw?("这个text不包含繁体字") # => false
|
|
66
|
+
|
|
67
|
+
# Check if text contains Simplified Chinese characters
|
|
68
|
+
puts unihan.contains_zh_cn?("这个text包含简体字") # => true
|
|
69
|
+
puts unihan.contains_zh_cn?("這個text不包含簡體字") # => false
|
|
52
70
|
```
|
|
53
71
|
|
|
54
|
-
##
|
|
72
|
+
## Features
|
|
73
|
+
|
|
74
|
+
- `determine_language(text)`: Determines the language of the text ("ZH_TW", "ZH_CN", "Unknown").
|
|
75
|
+
- `zh_tw?(text)`: Checks if the text is in Traditional Chinese.
|
|
76
|
+
- `zh_cn?(text)`: Checks if the text is in Simplified Chinese.
|
|
77
|
+
- `contains_chinese?(text)`: Checks if the text contains Chinese characters.
|
|
78
|
+
- `extract_chinese_characters(text)`: Extracts Chinese characters from the text.
|
|
79
|
+
- `only_zh_tw?(text)`: Checks if the text consists only of Traditional Chinese characters.
|
|
80
|
+
- `only_zh_cn?(text)`: Checks if the text consists only of Simplified Chinese characters.
|
|
81
|
+
- `contains_zh_tw?(text)`: Checks if the text contains Traditional Chinese characters.
|
|
82
|
+
- `contains_zh_cn?(text)`: Checks if the text contains Simplified Chinese characters.
|
|
83
|
+
|
|
84
|
+
## Note
|
|
55
85
|
|
|
56
|
-
|
|
57
|
-
|
|
86
|
+
This library does not guarantee 100% accuracy in language identification.
|
|
87
|
+
Particularly for short texts or texts containing multiple languages, determination may be challenging.
|
|
88
|
+
The distinction between Traditional and Simplified Chinese is based on the Unihan database.
|
|
@@ -43,7 +43,6 @@ module UnihanLang
|
|
|
43
43
|
|
|
44
44
|
def load_chinese_characters
|
|
45
45
|
load_unihan_variants
|
|
46
|
-
load_traditional_chinese_list
|
|
47
46
|
process_character_sets
|
|
48
47
|
end
|
|
49
48
|
|
|
@@ -58,31 +57,28 @@ module UnihanLang
|
|
|
58
57
|
end
|
|
59
58
|
|
|
60
59
|
def process_unihan_fields(fields)
|
|
61
|
-
|
|
60
|
+
from = [fields[0].gsub(/^U\+/, "").hex].pack("U")
|
|
62
61
|
# Remove dictionary name.
|
|
63
62
|
# Example: U+348B kSemanticVariant U+5EDD<kMatthews U+53AE<kMatthews
|
|
64
|
-
|
|
63
|
+
to = [fields[2].split("<")[0].gsub(/^U\+/, "").hex].pack("U")
|
|
65
64
|
case fields[1]
|
|
66
65
|
when "kTraditionalVariant"
|
|
67
|
-
@
|
|
68
|
-
@
|
|
66
|
+
@zh_cn << from
|
|
67
|
+
@zh_tw << to
|
|
69
68
|
when "kSimplifiedVariant"
|
|
70
|
-
@
|
|
71
|
-
@
|
|
69
|
+
@zh_tw << from
|
|
70
|
+
@zh_cn << to
|
|
72
71
|
end
|
|
73
72
|
end
|
|
74
73
|
|
|
75
|
-
def load_traditional_chinese_list
|
|
76
|
-
file_path = File.join(File.dirname(__FILE__), "..", "..", "data",
|
|
77
|
-
"traditional_chinese_list.txt")
|
|
78
|
-
File.foreach(file_path, encoding: "UTF-8") { |line| @zh_tw << line.strip }
|
|
79
|
-
end
|
|
80
|
-
|
|
81
74
|
def process_character_sets
|
|
75
|
+
# Some code points are listed as both zh_tw and zh_cn in Unihan_Variants.txt.
|
|
76
|
+
# Example: 台(U+53F0)
|
|
77
|
+
# U+53F0 kSimplifiedVariant U+53F0
|
|
78
|
+
# U+53F0 kTraditionalVariant U+53F0 U+6AAF U+81FA U+98B1
|
|
82
79
|
@common = @zh_tw & @zh_cn
|
|
83
|
-
@zh_tw -= @
|
|
84
|
-
@zh_cn -= @
|
|
85
|
-
@zh_cn |= @common
|
|
80
|
+
@zh_tw -= @common
|
|
81
|
+
@zh_cn -= @common
|
|
86
82
|
end
|
|
87
83
|
end
|
|
88
84
|
end
|
data/lib/unihan_lang/version.rb
CHANGED
data/lib/unihan_lang.rb
CHANGED
data/unihan_lang.gemspec
CHANGED
|
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
|
9
9
|
spec.authors = ["kyubey1228"]
|
|
10
10
|
spec.email = ["kyuuka1228@gmail.com"]
|
|
11
11
|
|
|
12
|
-
spec.summary = "Language detection for Chinese
|
|
13
|
-
spec.description = "A gem to detect and differentiate between Traditional Chinese, Simplified Chinese
|
|
12
|
+
spec.summary = "Language detection for Chinese characters"
|
|
13
|
+
spec.description = "A gem to detect and differentiate between Traditional Chinese, Simplified Chinese based on Unihan data."
|
|
14
14
|
spec.homepage = "https://github.com/kyubey1228/unihan_lang"
|
|
15
15
|
spec.license = "MIT"
|
|
16
16
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unihan_lang
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- kyubey1228
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-10-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -53,7 +53,7 @@ dependencies:
|
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '3.0'
|
|
55
55
|
description: A gem to detect and differentiate between Traditional Chinese, Simplified
|
|
56
|
-
Chinese
|
|
56
|
+
Chinese based on Unihan data.
|
|
57
57
|
email:
|
|
58
58
|
- kyuuka1228@gmail.com
|
|
59
59
|
executables: []
|
|
@@ -66,15 +66,14 @@ files:
|
|
|
66
66
|
- ".rubocop.yml"
|
|
67
67
|
- Gemfile
|
|
68
68
|
- Gemfile.lock
|
|
69
|
+
- LICENSE.md
|
|
70
|
+
- README.ja.md
|
|
69
71
|
- README.md
|
|
70
72
|
- Rakefile
|
|
71
73
|
- data/Unihan_Variants.txt
|
|
72
|
-
- data/traditional_chinese_list.txt
|
|
73
74
|
- lib/unihan_lang.rb
|
|
74
75
|
- lib/unihan_lang/chinese_processor.rb
|
|
75
76
|
- lib/unihan_lang/version.rb
|
|
76
|
-
- test.rb
|
|
77
|
-
- traditional_characters.txt
|
|
78
77
|
- unihan_lang.gemspec
|
|
79
78
|
homepage: https://github.com/kyubey1228/unihan_lang
|
|
80
79
|
licenses:
|
|
@@ -95,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
95
94
|
- !ruby/object:Gem::Version
|
|
96
95
|
version: '0'
|
|
97
96
|
requirements: []
|
|
98
|
-
rubygems_version: 3.5.
|
|
97
|
+
rubygems_version: 3.5.3
|
|
99
98
|
signing_key:
|
|
100
99
|
specification_version: 4
|
|
101
|
-
summary: Language detection for Chinese
|
|
100
|
+
summary: Language detection for Chinese characters
|
|
102
101
|
test_files: []
|