script_detector_2 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +10 -5
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +8 -8
- data/README.md +1 -0
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +60 -11
- data/script_detector_2.gemspec +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37ab1716845b98ca15a072e67cd900f5854dadd3c3641667fb5887c1626c4a85
|
4
|
+
data.tar.gz: 5142f341e40601f3d1fff211ff1fa566047f6353d0d3f6bc5062653827e759db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35b1771a4d8898d6ca02f38b4a016adc927600e7456b0a41ae515f88e85c846d8a41951ce78fa12c7a2851fc596a13ab48a3537dd97ff4579e8cd45d19b50ccf
|
7
|
+
data.tar.gz: ea6da32d1c4b9a10a4c04208fda4bee7bc6bad7d8ffa635f163c608e256160b4115fe3c1a8f4e7c9ec8b65f435ddca1fcb7af9f857a82042edd11457685d7424
|
data/.rubocop_todo.yml
CHANGED
@@ -1,22 +1,27 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-
|
3
|
+
# on 2021-11-24 02:35:34 UTC using RuboCop version 1.23.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 2
|
10
|
+
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Max: 20
|
13
|
+
|
14
|
+
# Offense count: 2
|
10
15
|
# Configuration parameters: IgnoredMethods.
|
11
16
|
Metrics/CyclomaticComplexity:
|
12
|
-
Max:
|
17
|
+
Max: 13
|
13
18
|
|
14
19
|
# Offense count: 2
|
15
20
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
16
21
|
Metrics/MethodLength:
|
17
|
-
Max:
|
22
|
+
Max: 15
|
18
23
|
|
19
24
|
# Offense count: 1
|
20
25
|
# Configuration parameters: IgnoredMethods.
|
21
26
|
Metrics/PerceivedComplexity:
|
22
|
-
Max:
|
27
|
+
Max: 14
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
script_detector_2 (0.
|
4
|
+
script_detector_2 (0.4.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
10
|
backport (1.2.0)
|
11
|
-
benchmark (0.
|
11
|
+
benchmark (0.2.0)
|
12
12
|
byebug (11.1.3)
|
13
13
|
diff-lcs (1.4.4)
|
14
14
|
e2mmap (0.1.0)
|
@@ -23,14 +23,14 @@ GEM
|
|
23
23
|
parallel (1.21.0)
|
24
24
|
parser (3.0.2.0)
|
25
25
|
ast (~> 2.4.1)
|
26
|
-
racc (1.
|
26
|
+
racc (1.6.0)
|
27
27
|
rainbow (3.0.0)
|
28
28
|
rake (13.0.6)
|
29
29
|
regexp_parser (2.1.1)
|
30
|
-
reverse_markdown (2.
|
30
|
+
reverse_markdown (2.1.1)
|
31
31
|
nokogiri
|
32
32
|
rexml (3.2.5)
|
33
|
-
rubocop (1.
|
33
|
+
rubocop (1.23.0)
|
34
34
|
parallel (~> 1.10)
|
35
35
|
parser (>= 3.0.0.0)
|
36
36
|
rainbow (>= 2.2.2, < 4.0)
|
@@ -39,11 +39,11 @@ GEM
|
|
39
39
|
rubocop-ast (>= 1.12.0, < 2.0)
|
40
40
|
ruby-progressbar (~> 1.7)
|
41
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.13.0)
|
43
43
|
parser (>= 3.0.1.1)
|
44
44
|
ruby-progressbar (1.11.0)
|
45
45
|
rubyzip (2.3.2)
|
46
|
-
solargraph (0.44.
|
46
|
+
solargraph (0.44.2)
|
47
47
|
backport (~> 1.2)
|
48
48
|
benchmark
|
49
49
|
bundler (>= 1.17.2)
|
@@ -76,4 +76,4 @@ DEPENDENCIES
|
|
76
76
|
solargraph
|
77
77
|
|
78
78
|
BUNDLED WITH
|
79
|
-
2.2.
|
79
|
+
2.2.32
|
data/README.md
CHANGED
@@ -42,6 +42,7 @@ The main detection methods are:
|
|
42
42
|
- `ScriptDetector2.simplified_chinese?`
|
43
43
|
- `ScriptDetector2.traditional_chinese?`
|
44
44
|
- `ScriptDetector2.identify_script`
|
45
|
+
- `ScriptDetector2.identify_scripts`
|
45
46
|
|
46
47
|
Regexp patterns are used to identify the script to which Han characters belong.
|
47
48
|
These can be used directly as well:
|
data/lib/script_detector_2.rb
CHANGED
@@ -8,7 +8,7 @@ require_relative 'script_detector_2/string'
|
|
8
8
|
module ScriptDetector2
|
9
9
|
class << self
|
10
10
|
# @param string [String]
|
11
|
-
# @return [Boolean]
|
11
|
+
# @return [Boolean] true if +string+ appears to be Japanese
|
12
12
|
def japanese?(string)
|
13
13
|
return true if kana?(string)
|
14
14
|
return false if hangul?(string)
|
@@ -20,13 +20,14 @@ module ScriptDetector2
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# @param string [String]
|
23
|
-
# @return [Boolean]
|
23
|
+
# @return [Boolean] true if +string+ contains Hiragana or Katakana
|
24
24
|
def kana?(string)
|
25
25
|
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
26
|
end
|
27
27
|
|
28
28
|
# @param string [String]
|
29
|
-
# @return [Boolean]
|
29
|
+
# @return [Boolean] true if +string+ appears to be Chinese (either
|
30
|
+
# Simplified or Traditional)
|
30
31
|
def chinese?(string)
|
31
32
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
32
33
|
|
@@ -34,7 +35,7 @@ module ScriptDetector2
|
|
34
35
|
end
|
35
36
|
|
36
37
|
# @param string [String]
|
37
|
-
# @return [Boolean]
|
38
|
+
# @return [Boolean] true if +string+ appears to be Simplified Chinese
|
38
39
|
def simplified_chinese?(string)
|
39
40
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
40
41
|
|
@@ -45,7 +46,7 @@ module ScriptDetector2
|
|
45
46
|
end
|
46
47
|
|
47
48
|
# @param string [String]
|
48
|
-
# @return [Boolean]
|
49
|
+
# @return [Boolean] true if +string+ appears to be Traditional Chinese
|
49
50
|
def traditional_chinese?(string)
|
50
51
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
51
52
|
|
@@ -56,7 +57,7 @@ module ScriptDetector2
|
|
56
57
|
end
|
57
58
|
|
58
59
|
# @param string [String]
|
59
|
-
# @return [Boolean]
|
60
|
+
# @return [Boolean] true if +string+ appears to be Korean
|
60
61
|
def korean?(string)
|
61
62
|
return true if hangul?(string)
|
62
63
|
return false if kana?(string)
|
@@ -68,11 +69,24 @@ module ScriptDetector2
|
|
68
69
|
end
|
69
70
|
|
70
71
|
# @param string [String]
|
71
|
-
# @return [Boolean]
|
72
|
+
# @return [Boolean] true if +string+ contains Hangul
|
72
73
|
def hangul?(string)
|
73
74
|
/\p{Hangul}/.match?(string)
|
74
75
|
end
|
75
76
|
|
77
|
+
# Make a best-effort attempt to guess the single script of +string+.
|
78
|
+
# Result is a symbol representing one of the scripts defined by ISO 15924,
|
79
|
+
# namely one of:
|
80
|
+
# - Hans (Simplified Chinese)
|
81
|
+
# - Hant (Traditional Chinese)
|
82
|
+
# - Hani (Unspecified Han)
|
83
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
84
|
+
# - Kore (Korean: Hangul, Han)
|
85
|
+
# - Zyyy (Undetermined)
|
86
|
+
#
|
87
|
+
# Note that this is likely to give poor results for very short strings,
|
88
|
+
# which are often inherently ambiguous.
|
89
|
+
#
|
76
90
|
# @param string [String]
|
77
91
|
# @return [Symbol]
|
78
92
|
def identify_script(string)
|
@@ -81,15 +95,50 @@ module ScriptDetector2
|
|
81
95
|
|
82
96
|
is_hant = traditional_chinese?(string)
|
83
97
|
is_hans = simplified_chinese?(string)
|
84
|
-
if is_hant && is_hans
|
85
|
-
|
98
|
+
return :Hani if is_hant && is_hans
|
99
|
+
|
100
|
+
is_japanese = japanese?(string)
|
101
|
+
return :Hani if is_japanese && (is_hant || is_hans)
|
102
|
+
|
103
|
+
# At this point we have determined that the string does not contain
|
104
|
+
# Hangul; for such a string to be Korean would be unusual. Allowing Korean
|
105
|
+
# to dilute the result to Hani is going to be a loss on average, so we
|
106
|
+
# don't handle it like Japanese above.
|
107
|
+
|
108
|
+
if is_hans then :Hans
|
86
109
|
elsif is_hant then :Hant
|
87
|
-
elsif
|
110
|
+
elsif is_japanese then :Jpan
|
88
111
|
elsif korean?(string) then :Kore
|
89
|
-
elsif chinese?(string) then :Hani
|
112
|
+
elsif chinese?(string) then :Hani
|
90
113
|
else
|
91
114
|
:Zyyy
|
92
115
|
end
|
93
116
|
end
|
117
|
+
|
118
|
+
# Identify all CJK scripts represented in +string+. Result is a list of symbols
|
119
|
+
# representing scripts defined by ISO 15924, namely one or more of:
|
120
|
+
# - Hans (Simplified Chinese)
|
121
|
+
# - Hant (Traditional Chinese)
|
122
|
+
# - Hani (Unspecified Han)
|
123
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
124
|
+
# - Kore (Korean: Hangul, Han)
|
125
|
+
# - Zyyy (Undetermined)
|
126
|
+
#
|
127
|
+
# This method does not attempt to identify other scripts such as Latn.
|
128
|
+
#
|
129
|
+
# @param string [String]
|
130
|
+
# @return [Array<Symbol>]
|
131
|
+
def identify_scripts(string)
|
132
|
+
result = []
|
133
|
+
|
134
|
+
result << :Hans if simplified_chinese?(string)
|
135
|
+
result << :Hant if traditional_chinese?(string)
|
136
|
+
result << :Jpan if japanese?(string)
|
137
|
+
result << :Kore if korean?(string)
|
138
|
+
result << :Hani if chinese?(string) && result.empty?
|
139
|
+
result << :Zyyy if result.empty?
|
140
|
+
|
141
|
+
result
|
142
|
+
end
|
94
143
|
end
|
95
144
|
end
|
data/script_detector_2.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
19
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
|
20
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
21
|
|
21
22
|
# Specify which files should be added to the gem when it is released.
|
22
23
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -100,6 +100,7 @@ licenses:
|
|
100
100
|
metadata:
|
101
101
|
homepage_uri: https://github.com/amake/script_detector_2
|
102
102
|
source_code_uri: https://github.com/amake/script_detector_2.git
|
103
|
+
rubygems_mfa_required: 'true'
|
103
104
|
post_install_message:
|
104
105
|
rdoc_options: []
|
105
106
|
require_paths:
|