script_detector_2 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +10 -5
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +8 -8
- data/README.md +1 -0
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +60 -11
- data/script_detector_2.gemspec +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37ab1716845b98ca15a072e67cd900f5854dadd3c3641667fb5887c1626c4a85
|
4
|
+
data.tar.gz: 5142f341e40601f3d1fff211ff1fa566047f6353d0d3f6bc5062653827e759db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35b1771a4d8898d6ca02f38b4a016adc927600e7456b0a41ae515f88e85c846d8a41951ce78fa12c7a2851fc596a13ab48a3537dd97ff4579e8cd45d19b50ccf
|
7
|
+
data.tar.gz: ea6da32d1c4b9a10a4c04208fda4bee7bc6bad7d8ffa635f163c608e256160b4115fe3c1a8f4e7c9ec8b65f435ddca1fcb7af9f857a82042edd11457685d7424
|
data/.rubocop_todo.yml
CHANGED
@@ -1,22 +1,27 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-
|
3
|
+
# on 2021-11-24 02:35:34 UTC using RuboCop version 1.23.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 2
|
10
|
+
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Max: 20
|
13
|
+
|
14
|
+
# Offense count: 2
|
10
15
|
# Configuration parameters: IgnoredMethods.
|
11
16
|
Metrics/CyclomaticComplexity:
|
12
|
-
Max:
|
17
|
+
Max: 13
|
13
18
|
|
14
19
|
# Offense count: 2
|
15
20
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
16
21
|
Metrics/MethodLength:
|
17
|
-
Max:
|
22
|
+
Max: 15
|
18
23
|
|
19
24
|
# Offense count: 1
|
20
25
|
# Configuration parameters: IgnoredMethods.
|
21
26
|
Metrics/PerceivedComplexity:
|
22
|
-
Max:
|
27
|
+
Max: 14
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
script_detector_2 (0.3.0)
|
4
|
+
script_detector_2 (0.4.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
10
|
backport (1.2.0)
|
11
|
-
benchmark (0.
|
11
|
+
benchmark (0.2.0)
|
12
12
|
byebug (11.1.3)
|
13
13
|
diff-lcs (1.4.4)
|
14
14
|
e2mmap (0.1.0)
|
@@ -23,14 +23,14 @@ GEM
|
|
23
23
|
parallel (1.21.0)
|
24
24
|
parser (3.0.2.0)
|
25
25
|
ast (~> 2.4.1)
|
26
|
-
racc (1.
|
26
|
+
racc (1.6.0)
|
27
27
|
rainbow (3.0.0)
|
28
28
|
rake (13.0.6)
|
29
29
|
regexp_parser (2.1.1)
|
30
|
-
reverse_markdown (2.
|
30
|
+
reverse_markdown (2.1.1)
|
31
31
|
nokogiri
|
32
32
|
rexml (3.2.5)
|
33
|
-
rubocop (1.
|
33
|
+
rubocop (1.23.0)
|
34
34
|
parallel (~> 1.10)
|
35
35
|
parser (>= 3.0.0.0)
|
36
36
|
rainbow (>= 2.2.2, < 4.0)
|
@@ -39,11 +39,11 @@ GEM
|
|
39
39
|
rubocop-ast (>= 1.12.0, < 2.0)
|
40
40
|
ruby-progressbar (~> 1.7)
|
41
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.13.0)
|
43
43
|
parser (>= 3.0.1.1)
|
44
44
|
ruby-progressbar (1.11.0)
|
45
45
|
rubyzip (2.3.2)
|
46
|
-
solargraph (0.44.
|
46
|
+
solargraph (0.44.2)
|
47
47
|
backport (~> 1.2)
|
48
48
|
benchmark
|
49
49
|
bundler (>= 1.17.2)
|
@@ -76,4 +76,4 @@ DEPENDENCIES
|
|
76
76
|
solargraph
|
77
77
|
|
78
78
|
BUNDLED WITH
|
79
|
-
2.2.
|
79
|
+
2.2.32
|
data/README.md
CHANGED
@@ -42,6 +42,7 @@ The main detection methods are:
|
|
42
42
|
- `ScriptDetector2.simplified_chinese?`
|
43
43
|
- `ScriptDetector2.traditional_chinese?`
|
44
44
|
- `ScriptDetector2.identify_script`
|
45
|
+
- `ScriptDetector2.identify_scripts`
|
45
46
|
|
46
47
|
Regexp patterns are used to identify the script to which Han characters belong.
|
47
48
|
These can be used directly as well:
|
data/lib/script_detector_2.rb
CHANGED
@@ -8,7 +8,7 @@ require_relative 'script_detector_2/string'
|
|
8
8
|
module ScriptDetector2
|
9
9
|
class << self
|
10
10
|
# @param string [String]
|
11
|
-
# @return [Boolean]
|
11
|
+
# @return [Boolean] true if +string+ appears to be Japanese
|
12
12
|
def japanese?(string)
|
13
13
|
return true if kana?(string)
|
14
14
|
return false if hangul?(string)
|
@@ -20,13 +20,14 @@ module ScriptDetector2
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# @param string [String]
|
23
|
-
# @return [Boolean]
|
23
|
+
# @return [Boolean] true if +string+ contains Hiragana or Katakana
|
24
24
|
def kana?(string)
|
25
25
|
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
26
|
end
|
27
27
|
|
28
28
|
# @param string [String]
|
29
|
-
# @return [Boolean]
|
29
|
+
# @return [Boolean] true if +string+ appears to be Chinese (either
|
30
|
+
# Simplified or Traditional)
|
30
31
|
def chinese?(string)
|
31
32
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
32
33
|
|
@@ -34,7 +35,7 @@ module ScriptDetector2
|
|
34
35
|
end
|
35
36
|
|
36
37
|
# @param string [String]
|
37
|
-
# @return [Boolean]
|
38
|
+
# @return [Boolean] true if +string+ appears to be Simplified Chinese
|
38
39
|
def simplified_chinese?(string)
|
39
40
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
40
41
|
|
@@ -45,7 +46,7 @@ module ScriptDetector2
|
|
45
46
|
end
|
46
47
|
|
47
48
|
# @param string [String]
|
48
|
-
# @return [Boolean]
|
49
|
+
# @return [Boolean] true if +string+ appears to be Traditional Chinese
|
49
50
|
def traditional_chinese?(string)
|
50
51
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
51
52
|
|
@@ -56,7 +57,7 @@ module ScriptDetector2
|
|
56
57
|
end
|
57
58
|
|
58
59
|
# @param string [String]
|
59
|
-
# @return [Boolean]
|
60
|
+
# @return [Boolean] true if +string+ appears to be Korean
|
60
61
|
def korean?(string)
|
61
62
|
return true if hangul?(string)
|
62
63
|
return false if kana?(string)
|
@@ -68,11 +69,24 @@ module ScriptDetector2
|
|
68
69
|
end
|
69
70
|
|
70
71
|
# @param string [String]
|
71
|
-
# @return [Boolean]
|
72
|
+
# @return [Boolean] true if +string+ contains Hangul
|
72
73
|
def hangul?(string)
|
73
74
|
/\p{Hangul}/.match?(string)
|
74
75
|
end
|
75
76
|
|
77
|
+
# Make a best-effort attempt to guess the single script of +string+.
|
78
|
+
# Result is a symbol representing one of the scripts defined by ISO 15924,
|
79
|
+
# namely one of:
|
80
|
+
# - Hans (Simplified Chinese)
|
81
|
+
# - Hant (Traditional Chinese)
|
82
|
+
# - Hani (Unspecified Han)
|
83
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
84
|
+
# - Kore (Korean: Hangul, Han)
|
85
|
+
# - Zyyy (Undetermined)
|
86
|
+
#
|
87
|
+
# Note that this is likely to give poor results for very short strings,
|
88
|
+
# which are often inherently ambiguous.
|
89
|
+
#
|
76
90
|
# @param string [String]
|
77
91
|
# @return [Symbol]
|
78
92
|
def identify_script(string)
|
@@ -81,15 +95,50 @@ module ScriptDetector2
|
|
81
95
|
|
82
96
|
is_hant = traditional_chinese?(string)
|
83
97
|
is_hans = simplified_chinese?(string)
|
84
|
-
if is_hant && is_hans
|
85
|
-
|
98
|
+
return :Hani if is_hant && is_hans
|
99
|
+
|
100
|
+
is_japanese = japanese?(string)
|
101
|
+
return :Hani if is_japanese && (is_hant || is_hans)
|
102
|
+
|
103
|
+
# At this point we have determined that the string does not contain
|
104
|
+
# Hangul; for such a string to be Korean would be unusual. Allowing Korean
|
105
|
+
# to dilute the result to Hani is going to be a loss on average, so we
|
106
|
+
# don't handle it like Japanese above.
|
107
|
+
|
108
|
+
if is_hans then :Hans
|
86
109
|
elsif is_hant then :Hant
|
87
|
-
elsif
|
110
|
+
elsif is_japanese then :Jpan
|
88
111
|
elsif korean?(string) then :Kore
|
89
|
-
elsif chinese?(string) then :Hani
|
112
|
+
elsif chinese?(string) then :Hani
|
90
113
|
else
|
91
114
|
:Zyyy
|
92
115
|
end
|
93
116
|
end
|
117
|
+
|
118
|
+
# Identify all CJK scripts represented in +string+. Result is a list of symbols
|
119
|
+
# representing scripts defined by ISO 15924, namely one or more of:
|
120
|
+
# - Hans (Simplified Chinese)
|
121
|
+
# - Hant (Traditional Chinese)
|
122
|
+
# - Hani (Unspecified Han)
|
123
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
124
|
+
# - Kore (Korean: Hangul, Han)
|
125
|
+
# - Zyyy (Undetermined)
|
126
|
+
#
|
127
|
+
# This method does not attempt to identify other scripts such as Latn.
|
128
|
+
#
|
129
|
+
# @param string [String]
|
130
|
+
# @return [Array<Symbol>]
|
131
|
+
def identify_scripts(string)
|
132
|
+
result = []
|
133
|
+
|
134
|
+
result << :Hans if simplified_chinese?(string)
|
135
|
+
result << :Hant if traditional_chinese?(string)
|
136
|
+
result << :Jpan if japanese?(string)
|
137
|
+
result << :Kore if korean?(string)
|
138
|
+
result << :Hani if chinese?(string) && result.empty?
|
139
|
+
result << :Zyyy if result.empty?
|
140
|
+
|
141
|
+
result
|
142
|
+
end
|
94
143
|
end
|
95
144
|
end
|
data/script_detector_2.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
19
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
|
20
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
21
|
|
21
22
|
# Specify which files should be added to the gem when it is released.
|
22
23
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -100,6 +100,7 @@ licenses:
|
|
100
100
|
metadata:
|
101
101
|
homepage_uri: https://github.com/amake/script_detector_2
|
102
102
|
source_code_uri: https://github.com/amake/script_detector_2.git
|
103
|
+
rubygems_mfa_required: 'true'
|
103
104
|
post_install_message:
|
104
105
|
rdoc_options: []
|
105
106
|
require_paths:
|