script_detector_2 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +4 -4
- data/.ruby-version +1 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile.lock +9 -9
- data/README.md +1 -1
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +23 -8
- data/tasks/gen_src.rake +1 -1
- data/tasks/unihan.rb +9 -4
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9709f6c83f82a3bf7073bb8b9cfc95a1edea6885e0f596ef6e93f01a62c26dbb
|
4
|
+
data.tar.gz: 3ed583c8487617e9687b3c776116f0cc22140993ef9debf4f8542057e9f9f232
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d17c19eee4868b540af844e08c95133415de8ccf0293dd208c11a29c14315b716c1563c45fd22081fbbf604147c837fa14bc98772a4103b2df080e4d09f89fa
|
7
|
+
data.tar.gz: 4a7894f23f49494f9debcafc967217ccb00549234014166f9fe4f1fe3d13b12b15fbe63eac426b164dfa763acbc9b4e15dfa8cd616de84720f1b2ea3249abfde
|
data/.rubocop.yml
CHANGED
data/.rubocop_todo.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-
|
3
|
+
# on 2021-10-11 07:56:31 UTC using RuboCop version 1.21.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
@@ -9,9 +9,9 @@
|
|
9
9
|
# Offense count: 1
|
10
10
|
# Configuration parameters: IgnoredMethods.
|
11
11
|
Metrics/CyclomaticComplexity:
|
12
|
-
Max:
|
12
|
+
Max: 10
|
13
13
|
|
14
|
-
# Offense count:
|
14
|
+
# Offense count: 2
|
15
15
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
16
16
|
Metrics/MethodLength:
|
17
17
|
Max: 14
|
@@ -19,4 +19,4 @@ Metrics/MethodLength:
|
|
19
19
|
# Offense count: 1
|
20
20
|
# Configuration parameters: IgnoredMethods.
|
21
21
|
Metrics/PerceivedComplexity:
|
22
|
-
Max:
|
22
|
+
Max: 11
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.4
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.3.0] - 2021-10-13
|
4
|
+
|
5
|
+
- Add `kana?` and `hangul?` methods
|
6
|
+
- Improve accuracy of `identify_script` method
|
7
|
+
- `chinese?` method now returns actual boolean instead of merely something
|
8
|
+
truthy
|
9
|
+
|
3
10
|
## [0.2.0] - 2021-08-23
|
4
11
|
|
5
12
|
- Slight optimization of script-matching regexps
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
script_detector_2 (0.
|
4
|
+
script_detector_2 (0.3.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -18,9 +18,9 @@ GEM
|
|
18
18
|
kramdown-parser-gfm (1.1.0)
|
19
19
|
kramdown (~> 2.0)
|
20
20
|
minitest (5.14.4)
|
21
|
-
nokogiri (1.12.
|
21
|
+
nokogiri (1.12.5-x86_64-darwin)
|
22
22
|
racc (~> 1.4)
|
23
|
-
parallel (1.
|
23
|
+
parallel (1.21.0)
|
24
24
|
parser (3.0.2.0)
|
25
25
|
ast (~> 2.4.1)
|
26
26
|
racc (1.5.2)
|
@@ -30,20 +30,20 @@ GEM
|
|
30
30
|
reverse_markdown (2.0.0)
|
31
31
|
nokogiri
|
32
32
|
rexml (3.2.5)
|
33
|
-
rubocop (1.
|
33
|
+
rubocop (1.22.1)
|
34
34
|
parallel (~> 1.10)
|
35
35
|
parser (>= 3.0.0.0)
|
36
36
|
rainbow (>= 2.2.2, < 4.0)
|
37
37
|
regexp_parser (>= 1.8, < 3.0)
|
38
38
|
rexml
|
39
|
-
rubocop-ast (>= 1.
|
39
|
+
rubocop-ast (>= 1.12.0, < 2.0)
|
40
40
|
ruby-progressbar (~> 1.7)
|
41
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.12.0)
|
43
43
|
parser (>= 3.0.1.1)
|
44
44
|
ruby-progressbar (1.11.0)
|
45
45
|
rubyzip (2.3.2)
|
46
|
-
solargraph (0.
|
46
|
+
solargraph (0.44.0)
|
47
47
|
backport (~> 1.2)
|
48
48
|
benchmark
|
49
49
|
bundler (>= 1.17.2)
|
@@ -60,7 +60,7 @@ GEM
|
|
60
60
|
yard (~> 0.9, >= 0.9.24)
|
61
61
|
thor (1.1.0)
|
62
62
|
tilt (2.0.10)
|
63
|
-
unicode-display_width (2.
|
63
|
+
unicode-display_width (2.1.0)
|
64
64
|
yard (0.9.26)
|
65
65
|
|
66
66
|
PLATFORMS
|
@@ -76,4 +76,4 @@ DEPENDENCIES
|
|
76
76
|
solargraph
|
77
77
|
|
78
78
|
BUNDLED WITH
|
79
|
-
2.2.
|
79
|
+
2.2.29
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Unlike the original script_detector, this gem:
|
|
12
12
|
- Uses the
|
13
13
|
[kUnihanCore2020](https://www.unicode.org/reports/tr38/#kUnihanCore2020)
|
14
14
|
property of the Unicode Unihan database to determine which characters belong
|
15
|
-
to which script (Unicode
|
15
|
+
to which script (Unicode 14)
|
16
16
|
([details](http://www.unicode.org/L2/L2019/19388-unihan-core-2020.pdf))
|
17
17
|
- Uses [ISO 15924 script names](https://en.wikipedia.org/wiki/ISO_15924) in
|
18
18
|
symbol form as return values (instead of English strings)
|
data/lib/script_detector_2.rb
CHANGED
@@ -10,8 +10,8 @@ module ScriptDetector2
|
|
10
10
|
# @param string [String]
|
11
11
|
# @return [Boolean]
|
12
12
|
def japanese?(string)
|
13
|
-
return true if string
|
14
|
-
return false if string
|
13
|
+
return true if kana?(string)
|
14
|
+
return false if hangul?(string)
|
15
15
|
|
16
16
|
kanji = string.scan(/\p{Han}/)
|
17
17
|
return false unless kanji.any?
|
@@ -19,12 +19,18 @@ module ScriptDetector2
|
|
19
19
|
kanji.all?(JAPANESE_PATTERN)
|
20
20
|
end
|
21
21
|
|
22
|
+
# @param string [String]
|
23
|
+
# @return [Boolean]
|
24
|
+
def kana?(string)
|
25
|
+
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
|
+
end
|
27
|
+
|
22
28
|
# @param string [String]
|
23
29
|
# @return [Boolean]
|
24
30
|
def chinese?(string)
|
25
31
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
26
32
|
|
27
|
-
|
33
|
+
/\p{Han}/.match?(string)
|
28
34
|
end
|
29
35
|
|
30
36
|
# @param string [String]
|
@@ -52,8 +58,8 @@ module ScriptDetector2
|
|
52
58
|
# @param string [String]
|
53
59
|
# @return [Boolean]
|
54
60
|
def korean?(string)
|
55
|
-
return true if string
|
56
|
-
return false if string
|
61
|
+
return true if hangul?(string)
|
62
|
+
return false if kana?(string)
|
57
63
|
|
58
64
|
hanja = string.scan(/\p{Han}/)
|
59
65
|
return false unless hanja.any?
|
@@ -61,19 +67,28 @@ module ScriptDetector2
|
|
61
67
|
hanja.all?(KOREAN_PATTERN)
|
62
68
|
end
|
63
69
|
|
70
|
+
# @param string [String]
|
71
|
+
# @return [Boolean]
|
72
|
+
def hangul?(string)
|
73
|
+
/\p{Hangul}/.match?(string)
|
74
|
+
end
|
75
|
+
|
64
76
|
# @param string [String]
|
65
77
|
# @return [Symbol]
|
66
78
|
def identify_script(string)
|
67
|
-
return :Jpan if
|
68
|
-
return :Kore if
|
79
|
+
return :Jpan if kana?(string)
|
80
|
+
return :Kore if hangul?(string)
|
69
81
|
|
70
82
|
is_hant = traditional_chinese?(string)
|
71
83
|
is_hans = simplified_chinese?(string)
|
72
84
|
if is_hant && is_hans then :Hani
|
73
85
|
elsif is_hans then :Hans
|
74
86
|
elsif is_hant then :Hant
|
87
|
+
elsif japanese?(string) then :Jpan
|
88
|
+
elsif korean?(string) then :Kore
|
75
89
|
elsif chinese?(string) then :Hani # rubocop:disable Lint/DuplicateBranch
|
76
|
-
else
|
90
|
+
else
|
91
|
+
:Zyyy
|
77
92
|
end
|
78
93
|
end
|
79
94
|
end
|
data/tasks/gen_src.rake
CHANGED
data/tasks/unihan.rb
CHANGED
@@ -5,15 +5,20 @@ module Unihan
|
|
5
5
|
CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
|
6
6
|
|
7
7
|
class << self
|
8
|
-
# @param
|
8
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
9
9
|
# @param tags [Array<String>]
|
10
10
|
# @return [Regexp]
|
11
11
|
def gen_unihan_core_pattern(dict_data, *tags)
|
12
|
-
|
12
|
+
gen_pattern(codepoints_for_tags(dict_data, tags))
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
16
|
+
# @param tags [Array<String>]
|
17
|
+
# @return [Array<Integer>]
|
18
|
+
def codepoints_for_tags(dict_data, tags)
|
19
|
+
dict_data.select do |_, data|
|
13
20
|
tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
|
14
21
|
end.keys
|
15
|
-
|
16
|
-
gen_pattern(codepoints)
|
17
22
|
end
|
18
23
|
|
19
24
|
# @param codepoints [Array<Integer>]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -76,6 +76,7 @@ files:
|
|
76
76
|
- ".dir-locals.el"
|
77
77
|
- ".rubocop.yml"
|
78
78
|
- ".rubocop_todo.yml"
|
79
|
+
- ".ruby-version"
|
79
80
|
- ".solargraph.yml"
|
80
81
|
- CHANGELOG.md
|
81
82
|
- Gemfile
|