script_detector_2 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +4 -4
- data/.ruby-version +1 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile.lock +9 -9
- data/README.md +1 -1
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +23 -8
- data/tasks/gen_src.rake +1 -1
- data/tasks/unihan.rb +9 -4
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9709f6c83f82a3bf7073bb8b9cfc95a1edea6885e0f596ef6e93f01a62c26dbb
|
4
|
+
data.tar.gz: 3ed583c8487617e9687b3c776116f0cc22140993ef9debf4f8542057e9f9f232
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d17c19eee4868b540af844e08c95133415de8ccf0293dd208c11a29c14315b716c1563c45fd22081fbbf604147c837fa14bc98772a4103b2df080e4d09f89fa
|
7
|
+
data.tar.gz: 4a7894f23f49494f9debcafc967217ccb00549234014166f9fe4f1fe3d13b12b15fbe63eac426b164dfa763acbc9b4e15dfa8cd616de84720f1b2ea3249abfde
|
data/.rubocop.yml
CHANGED
data/.rubocop_todo.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-
|
3
|
+
# on 2021-10-11 07:56:31 UTC using RuboCop version 1.21.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
@@ -9,9 +9,9 @@
|
|
9
9
|
# Offense count: 1
|
10
10
|
# Configuration parameters: IgnoredMethods.
|
11
11
|
Metrics/CyclomaticComplexity:
|
12
|
-
Max:
|
12
|
+
Max: 10
|
13
13
|
|
14
|
-
# Offense count:
|
14
|
+
# Offense count: 2
|
15
15
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
16
16
|
Metrics/MethodLength:
|
17
17
|
Max: 14
|
@@ -19,4 +19,4 @@ Metrics/MethodLength:
|
|
19
19
|
# Offense count: 1
|
20
20
|
# Configuration parameters: IgnoredMethods.
|
21
21
|
Metrics/PerceivedComplexity:
|
22
|
-
Max:
|
22
|
+
Max: 11
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.4
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.0] - 2021-10-13
|
4
|
+
|
5
|
+
- Add `kana?` and `hangul?` methods
|
6
|
+
- Improve accuracy of `identify_script` method
|
7
|
+
- `chinese?` method now returns actual boolean instead of merely something
|
8
|
+
truthy
|
9
|
+
|
3
10
|
## [0.2.0] - 2021-08-23
|
4
11
|
|
5
12
|
- Slight optimization of script-matching regexps
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
script_detector_2 (0.2.0)
|
4
|
+
script_detector_2 (0.3.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -18,9 +18,9 @@ GEM
|
|
18
18
|
kramdown-parser-gfm (1.1.0)
|
19
19
|
kramdown (~> 2.0)
|
20
20
|
minitest (5.14.4)
|
21
|
-
nokogiri (1.12.
|
21
|
+
nokogiri (1.12.5-x86_64-darwin)
|
22
22
|
racc (~> 1.4)
|
23
|
-
parallel (1.
|
23
|
+
parallel (1.21.0)
|
24
24
|
parser (3.0.2.0)
|
25
25
|
ast (~> 2.4.1)
|
26
26
|
racc (1.5.2)
|
@@ -30,20 +30,20 @@ GEM
|
|
30
30
|
reverse_markdown (2.0.0)
|
31
31
|
nokogiri
|
32
32
|
rexml (3.2.5)
|
33
|
-
rubocop (1.
|
33
|
+
rubocop (1.22.1)
|
34
34
|
parallel (~> 1.10)
|
35
35
|
parser (>= 3.0.0.0)
|
36
36
|
rainbow (>= 2.2.2, < 4.0)
|
37
37
|
regexp_parser (>= 1.8, < 3.0)
|
38
38
|
rexml
|
39
|
-
rubocop-ast (>= 1.
|
39
|
+
rubocop-ast (>= 1.12.0, < 2.0)
|
40
40
|
ruby-progressbar (~> 1.7)
|
41
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.12.0)
|
43
43
|
parser (>= 3.0.1.1)
|
44
44
|
ruby-progressbar (1.11.0)
|
45
45
|
rubyzip (2.3.2)
|
46
|
-
solargraph (0.
|
46
|
+
solargraph (0.44.0)
|
47
47
|
backport (~> 1.2)
|
48
48
|
benchmark
|
49
49
|
bundler (>= 1.17.2)
|
@@ -60,7 +60,7 @@ GEM
|
|
60
60
|
yard (~> 0.9, >= 0.9.24)
|
61
61
|
thor (1.1.0)
|
62
62
|
tilt (2.0.10)
|
63
|
-
unicode-display_width (2.
|
63
|
+
unicode-display_width (2.1.0)
|
64
64
|
yard (0.9.26)
|
65
65
|
|
66
66
|
PLATFORMS
|
@@ -76,4 +76,4 @@ DEPENDENCIES
|
|
76
76
|
solargraph
|
77
77
|
|
78
78
|
BUNDLED WITH
|
79
|
-
2.2.
|
79
|
+
2.2.29
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Unlike the original script_detector, this gem:
|
|
12
12
|
- Uses the
|
13
13
|
[kUnihanCore2020](https://www.unicode.org/reports/tr38/#kUnihanCore2020)
|
14
14
|
property of the Unicode Unihan database to determine which characters belong
|
15
|
-
to which script (Unicode
|
15
|
+
to which script (Unicode 14)
|
16
16
|
([details](http://www.unicode.org/L2/L2019/19388-unihan-core-2020.pdf))
|
17
17
|
- Uses [ISO 15924 script names](https://en.wikipedia.org/wiki/ISO_15924) in
|
18
18
|
symbol form as return values (instead of English strings)
|
data/lib/script_detector_2.rb
CHANGED
@@ -10,8 +10,8 @@ module ScriptDetector2
|
|
10
10
|
# @param string [String]
|
11
11
|
# @return [Boolean]
|
12
12
|
def japanese?(string)
|
13
|
-
return true if string
|
14
|
-
return false if string
|
13
|
+
return true if kana?(string)
|
14
|
+
return false if hangul?(string)
|
15
15
|
|
16
16
|
kanji = string.scan(/\p{Han}/)
|
17
17
|
return false unless kanji.any?
|
@@ -19,12 +19,18 @@ module ScriptDetector2
|
|
19
19
|
kanji.all?(JAPANESE_PATTERN)
|
20
20
|
end
|
21
21
|
|
22
|
+
# @param string [String]
|
23
|
+
# @return [Boolean]
|
24
|
+
def kana?(string)
|
25
|
+
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
|
+
end
|
27
|
+
|
22
28
|
# @param string [String]
|
23
29
|
# @return [Boolean]
|
24
30
|
def chinese?(string)
|
25
31
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
26
32
|
|
27
|
-
|
33
|
+
/\p{Han}/.match?(string)
|
28
34
|
end
|
29
35
|
|
30
36
|
# @param string [String]
|
@@ -52,8 +58,8 @@ module ScriptDetector2
|
|
52
58
|
# @param string [String]
|
53
59
|
# @return [Boolean]
|
54
60
|
def korean?(string)
|
55
|
-
return true if string
|
56
|
-
return false if string
|
61
|
+
return true if hangul?(string)
|
62
|
+
return false if kana?(string)
|
57
63
|
|
58
64
|
hanja = string.scan(/\p{Han}/)
|
59
65
|
return false unless hanja.any?
|
@@ -61,19 +67,28 @@ module ScriptDetector2
|
|
61
67
|
hanja.all?(KOREAN_PATTERN)
|
62
68
|
end
|
63
69
|
|
70
|
+
# @param string [String]
|
71
|
+
# @return [Boolean]
|
72
|
+
def hangul?(string)
|
73
|
+
/\p{Hangul}/.match?(string)
|
74
|
+
end
|
75
|
+
|
64
76
|
# @param string [String]
|
65
77
|
# @return [Symbol]
|
66
78
|
def identify_script(string)
|
67
|
-
return :Jpan if
|
68
|
-
return :Kore if
|
79
|
+
return :Jpan if kana?(string)
|
80
|
+
return :Kore if hangul?(string)
|
69
81
|
|
70
82
|
is_hant = traditional_chinese?(string)
|
71
83
|
is_hans = simplified_chinese?(string)
|
72
84
|
if is_hant && is_hans then :Hani
|
73
85
|
elsif is_hans then :Hans
|
74
86
|
elsif is_hant then :Hant
|
87
|
+
elsif japanese?(string) then :Jpan
|
88
|
+
elsif korean?(string) then :Kore
|
75
89
|
elsif chinese?(string) then :Hani # rubocop:disable Lint/DuplicateBranch
|
76
|
-
else
|
90
|
+
else
|
91
|
+
:Zyyy
|
77
92
|
end
|
78
93
|
end
|
79
94
|
end
|
data/tasks/gen_src.rake
CHANGED
data/tasks/unihan.rb
CHANGED
@@ -5,15 +5,20 @@ module Unihan
|
|
5
5
|
CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
|
6
6
|
|
7
7
|
class << self
|
8
|
-
# @param
|
8
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
9
9
|
# @param tags [Array<String>]
|
10
10
|
# @return [Regexp]
|
11
11
|
def gen_unihan_core_pattern(dict_data, *tags)
|
12
|
-
|
12
|
+
gen_pattern(codepoints_for_tags(dict_data, tags))
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
16
|
+
# @param tags [Array<String>]
|
17
|
+
# @return [Array<Integer>]
|
18
|
+
def codepoints_for_tags(dict_data, tags)
|
19
|
+
dict_data.select do |_, data|
|
13
20
|
tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
|
14
21
|
end.keys
|
15
|
-
|
16
|
-
gen_pattern(codepoints)
|
17
22
|
end
|
18
23
|
|
19
24
|
# @param codepoints [Array<Integer>]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -76,6 +76,7 @@ files:
|
|
76
76
|
- ".dir-locals.el"
|
77
77
|
- ".rubocop.yml"
|
78
78
|
- ".rubocop_todo.yml"
|
79
|
+
- ".ruby-version"
|
79
80
|
- ".solargraph.yml"
|
80
81
|
- CHANGELOG.md
|
81
82
|
- Gemfile
|