unihan_lang 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +30 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +102 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +68 -0
- data/README.md +57 -0
- data/Rakefile +12 -0
- data/data/Unihan_Variants.txt +17429 -0
- data/data/traditional_chinese_list.txt +6017 -0
- data/lib/unihan_lang/chinese_processor.rb +88 -0
- data/lib/unihan_lang/version.rb +5 -0
- data/lib/unihan_lang.rb +68 -0
- data/test.rb +58 -0
- data/traditional_characters.txt +0 -0
- data/unihan_lang.gemspec +30 -0
- metadata +102 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: bc3bac523b20f37850e5d1e43587a736cb1922bb8fc0ea01a39b3c534d257045
|
|
4
|
+
data.tar.gz: f29a6426a0f23ac69d4865a0988586b628c7dd10869adafff412b0f4daabfafb
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 490b594addd8d6517bbbba34da1c3c7528c12e8a5a914101ccd5c0e71ce8fe0f406e282bf29ab0281dc8317ee7d240a58ccdfc3928911d009e5859147474081c
|
|
7
|
+
data.tar.gz: 13b72769f0e4e10f1e02c56659f56a39aa0f205a82312eeb14be0f431f36e5b8794c312d98306eeea063c56d5f1c711abc829539c83256d7772556eb1fb54d3b
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Ruby
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- "**"
|
|
7
|
+
|
|
8
|
+
pull_request:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
name: Ruby ${{ matrix.ruby }}
|
|
14
|
+
strategy:
|
|
15
|
+
matrix:
|
|
16
|
+
ruby:
|
|
17
|
+
- '3.3.0'
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- name: Set up Ruby
|
|
22
|
+
uses: ruby/setup-ruby@v1
|
|
23
|
+
with:
|
|
24
|
+
ruby-version: ${{ matrix.ruby }}
|
|
25
|
+
bundler-cache: true
|
|
26
|
+
# FIXME: enable after #1
|
|
27
|
+
#- name: Rubocop
|
|
28
|
+
# run: bundle exec rubocop
|
|
29
|
+
- name: RSpec
|
|
30
|
+
run: bundle exec rspec
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Normally, additional AllCops/Include and AllCops/Exclude settings override RuboCop's defaults.
|
|
2
|
+
# But here we want to apply both RuboCop's defaults and the additional AllCops/Include and AllCops/Exclude settings.
|
|
3
|
+
# ref: https://docs.rubocop.org/en/stable/configuration/#unusual-files-that-would-not-be-included-by-default
|
|
4
|
+
inherit_mode:
|
|
5
|
+
merge:
|
|
6
|
+
- Include
|
|
7
|
+
- Exclude
|
|
8
|
+
|
|
9
|
+
AllCops:
|
|
10
|
+
TargetRubyVersion: 3.3
|
|
11
|
+
Exclude:
|
|
12
|
+
- 'Gemfile'
|
|
13
|
+
- 'bin/*'
|
|
14
|
+
- 'config/**/*'
|
|
15
|
+
- 'db/**/*'
|
|
16
|
+
- 'deploy/**/*'
|
|
17
|
+
|
|
18
|
+
# Accept single-line methods with no body
|
|
19
|
+
Style/SingleLineMethods:
|
|
20
|
+
AllowIfMethodIsEmpty: true
|
|
21
|
+
|
|
22
|
+
# Top-level documentation of classes and modules is unnecessary
|
|
23
|
+
Style/Documentation:
|
|
24
|
+
Enabled: false
|
|
25
|
+
|
|
26
|
+
# Allow chaining a block onto another block that spans multiple lines
|
|
27
|
+
Style/MultilineBlockChain:
|
|
28
|
+
Enabled: false
|
|
29
|
+
|
|
30
|
+
# Allow `->` literal for multi line blocks
|
|
31
|
+
Style/Lambda:
|
|
32
|
+
Enabled: false
|
|
33
|
+
|
|
34
|
+
# Both nested and compact are okay
|
|
35
|
+
Style/ClassAndModuleChildren:
|
|
36
|
+
Enabled: false
|
|
37
|
+
|
|
38
|
+
# Specifying param names is unnecessary
|
|
39
|
+
Style/SingleLineBlockParams:
|
|
40
|
+
Enabled: false
|
|
41
|
+
|
|
42
|
+
# Prefer Kernel#sprintf
|
|
43
|
+
Style/FormatString:
|
|
44
|
+
EnforcedStyle: sprintf
|
|
45
|
+
|
|
46
|
+
# Maximum method length
|
|
47
|
+
Metrics/MethodLength:
|
|
48
|
+
Max: 20
|
|
49
|
+
|
|
50
|
+
# Tune to MethodLength
|
|
51
|
+
Metrics/AbcSize:
|
|
52
|
+
Max: 30
|
|
53
|
+
|
|
54
|
+
# Tune to MethodLength
|
|
55
|
+
Metrics/ClassLength:
|
|
56
|
+
Max: 200
|
|
57
|
+
|
|
58
|
+
# Maximum line length
|
|
59
|
+
Layout/LineLength:
|
|
60
|
+
Max: 100
|
|
61
|
+
|
|
62
|
+
# Allow `has_` as prefix of predicate methods
|
|
63
|
+
Naming/PredicateName:
|
|
64
|
+
ForbiddenPrefixes:
|
|
65
|
+
- is_
|
|
66
|
+
- have_
|
|
67
|
+
|
|
68
|
+
# Prefer double_quotes strings unless your string literal contains escape chars
|
|
69
|
+
Style/StringLiterals:
|
|
70
|
+
EnforcedStyle: double_quotes
|
|
71
|
+
|
|
72
|
+
# Prefer raise over fail for exceptions
|
|
73
|
+
Style/SignalException:
|
|
74
|
+
EnforcedStyle: only_raise
|
|
75
|
+
|
|
76
|
+
# Allow empty condition in case statements
|
|
77
|
+
Style/EmptyCaseCondition:
|
|
78
|
+
Enabled: false
|
|
79
|
+
|
|
80
|
+
# Prefer trailing comma in argument lists
|
|
81
|
+
Style/TrailingCommaInArguments:
|
|
82
|
+
EnforcedStyleForMultiline: comma
|
|
83
|
+
|
|
84
|
+
# Prefer trailing comma in array literals
|
|
85
|
+
Style/TrailingCommaInArrayLiteral:
|
|
86
|
+
EnforcedStyleForMultiline: comma
|
|
87
|
+
|
|
88
|
+
# Prefer trailing comma in hash literals
|
|
89
|
+
Style/TrailingCommaInHashLiteral:
|
|
90
|
+
EnforcedStyleForMultiline: comma
|
|
91
|
+
|
|
92
|
+
# Prefer parentheses for almost all percent literals
|
|
93
|
+
Style/PercentLiteralDelimiters:
|
|
94
|
+
PreferredDelimiters:
|
|
95
|
+
'%i': '()'
|
|
96
|
+
'%I': '()'
|
|
97
|
+
'%w': '()'
|
|
98
|
+
'%W': '()'
|
|
99
|
+
|
|
100
|
+
# Prefer the verbose `has_key?` / `has_value?` style for Hash methods
|
|
101
|
+
Style/PreferredHashMethods:
|
|
102
|
+
EnforcedStyle: verbose
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
unihan_lang (0.1.0)
|
|
5
|
+
|
|
6
|
+
GEM
|
|
7
|
+
remote: https://rubygems.org/
|
|
8
|
+
specs:
|
|
9
|
+
ast (2.4.2)
|
|
10
|
+
diff-lcs (1.5.1)
|
|
11
|
+
json (2.7.2)
|
|
12
|
+
language_server-protocol (3.17.0.3)
|
|
13
|
+
parallel (1.26.3)
|
|
14
|
+
parser (3.3.4.2)
|
|
15
|
+
ast (~> 2.4.1)
|
|
16
|
+
racc
|
|
17
|
+
racc (1.8.1)
|
|
18
|
+
rainbow (3.1.1)
|
|
19
|
+
rake (13.2.1)
|
|
20
|
+
regexp_parser (2.9.2)
|
|
21
|
+
rspec (3.13.0)
|
|
22
|
+
rspec-core (~> 3.13.0)
|
|
23
|
+
rspec-expectations (~> 3.13.0)
|
|
24
|
+
rspec-mocks (~> 3.13.0)
|
|
25
|
+
rspec-core (3.13.1)
|
|
26
|
+
rspec-support (~> 3.13.0)
|
|
27
|
+
rspec-expectations (3.13.2)
|
|
28
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
29
|
+
rspec-support (~> 3.13.0)
|
|
30
|
+
rspec-mocks (3.13.1)
|
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
32
|
+
rspec-support (~> 3.13.0)
|
|
33
|
+
rspec-support (3.13.1)
|
|
34
|
+
rubocop (1.66.0)
|
|
35
|
+
json (~> 2.3)
|
|
36
|
+
language_server-protocol (>= 3.17.0)
|
|
37
|
+
parallel (~> 1.10)
|
|
38
|
+
parser (>= 3.3.0.2)
|
|
39
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
40
|
+
regexp_parser (>= 2.4, < 3.0)
|
|
41
|
+
rubocop-ast (>= 1.32.1, < 2.0)
|
|
42
|
+
ruby-progressbar (~> 1.7)
|
|
43
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
|
44
|
+
rubocop-ast (1.32.2)
|
|
45
|
+
parser (>= 3.3.1.0)
|
|
46
|
+
rubocop-performance (1.21.1)
|
|
47
|
+
rubocop (>= 1.48.1, < 2.0)
|
|
48
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
|
49
|
+
rubocop-rspec (3.0.4)
|
|
50
|
+
rubocop (~> 1.61)
|
|
51
|
+
ruby-progressbar (1.13.0)
|
|
52
|
+
unicode-display_width (2.5.0)
|
|
53
|
+
|
|
54
|
+
PLATFORMS
|
|
55
|
+
arm64-darwin-23
|
|
56
|
+
ruby
|
|
57
|
+
|
|
58
|
+
DEPENDENCIES
|
|
59
|
+
bundler (~> 2.0)
|
|
60
|
+
rake (~> 13.0)
|
|
61
|
+
rspec (~> 3.0)
|
|
62
|
+
rubocop
|
|
63
|
+
rubocop-performance
|
|
64
|
+
rubocop-rspec
|
|
65
|
+
unihan_lang!
|
|
66
|
+
|
|
67
|
+
BUNDLED WITH
|
|
68
|
+
2.5.4
|
data/README.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
<!-- @format -->
|
|
2
|
+
|
|
3
|
+
# UnihanLang
|
|
4
|
+
|
|
5
|
+
UnihanLang は、テキストの言語(日本語、繁体字中国語、簡体字中国語)を識別するための Ruby ライブラリです。
|
|
6
|
+
|
|
7
|
+
## インストール
|
|
8
|
+
|
|
9
|
+
Gemfile に以下の行を追加してください:
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem 'unihan_lang'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
そして、以下のコマンドを実行してください:
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
bundle install
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
または、直接インストールする場合は以下のコマンドを使用してください:
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
gem install unihan_lang
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## 使用方法
|
|
28
|
+
|
|
29
|
+
```ruby
|
|
30
|
+
require 'unihan_lang'
|
|
31
|
+
|
|
32
|
+
unihan = UnihanLang::Unihan.new
|
|
33
|
+
|
|
34
|
+
# 言語の判定
|
|
35
|
+
puts unihan.determine_language("這是繁體中文") # => "ZH_TW"
|
|
36
|
+
puts unihan.determine_language("这是简体中文") # => "ZH_CN"
|
|
37
|
+
|
|
38
|
+
# 繁体字中国語かどうかの判定
|
|
39
|
+
puts unihan.zh_tw?("這是繁體中文") # => true
|
|
40
|
+
puts unihan.zh_tw?("这不是繁体中文") # => false
|
|
41
|
+
|
|
42
|
+
# 簡体字中国語かどうかの判定
|
|
43
|
+
puts unihan.zh_cn?("这是简体中文") # => true
|
|
44
|
+
puts unihan.zh_cn?("這不是簡體中文") # => false
|
|
45
|
+
|
|
46
|
+
# テキストに中国語の文字が含まれているかの判定
|
|
47
|
+
puts unihan.contains_chinese?("This text contains 中文") # => true
|
|
48
|
+
puts unihan.contains_chinese?("This text has no Chinese") # => false
|
|
49
|
+
|
|
50
|
+
# テキストから中国語の文字を抽出
|
|
51
|
+
puts unihan.extract_chinese_characters("This text contains 中文").join # => "中文"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 注意事項
|
|
55
|
+
|
|
56
|
+
このライブラリは、テキストの言語を完全に正確に判定することを保証するものではありません。
|
|
57
|
+
特に、短いテキストや複数の言語が混在するテキストの場合、判定が難しい場合があります。
|