script_detector_2 0.1.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +17 -2
- data/.ruby-version +1 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile.lock +12 -12
- data/README.md +11 -1
- data/lib/script_detector_2/patterns.gen.rb +4 -4
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +82 -14
- data/script_detector_2.gemspec +1 -0
- data/tasks/gen_src.rake +1 -1
- data/tasks/unihan.rb +12 -5
- metadata +4 -2
data/lib/script_detector_2.rb
CHANGED
@@ -8,10 +8,10 @@ require_relative 'script_detector_2/string'
|
|
8
8
|
module ScriptDetector2
|
9
9
|
class << self
|
10
10
|
# @param string [String]
|
11
|
-
# @return [Boolean]
|
11
|
+
# @return [Boolean] true if +string+ appears to be Japanese
|
12
12
|
def japanese?(string)
|
13
|
-
return true if string
|
14
|
-
return false if string
|
13
|
+
return true if kana?(string)
|
14
|
+
return false if hangul?(string)
|
15
15
|
|
16
16
|
kanji = string.scan(/\p{Han}/)
|
17
17
|
return false unless kanji.any?
|
@@ -20,15 +20,22 @@ module ScriptDetector2
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# @param string [String]
|
23
|
-
# @return [Boolean]
|
23
|
+
# @return [Boolean] true if +string+ contains Hiragana or Katakana
|
24
|
+
def kana?(string)
|
25
|
+
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param string [String]
|
29
|
+
# @return [Boolean] true if +string+ appears to be Chinese (either
|
30
|
+
# Simplified or Traditional)
|
24
31
|
def chinese?(string)
|
25
32
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
26
33
|
|
27
|
-
|
34
|
+
/\p{Han}/.match?(string)
|
28
35
|
end
|
29
36
|
|
30
37
|
# @param string [String]
|
31
|
-
# @return [Boolean]
|
38
|
+
# @return [Boolean] true if +string+ appears to be Simplified Chinese
|
32
39
|
def simplified_chinese?(string)
|
33
40
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
34
41
|
|
@@ -39,7 +46,7 @@ module ScriptDetector2
|
|
39
46
|
end
|
40
47
|
|
41
48
|
# @param string [String]
|
42
|
-
# @return [Boolean]
|
49
|
+
# @return [Boolean] true if +string+ appears to be Traditional Chinese
|
43
50
|
def traditional_chinese?(string)
|
44
51
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
45
52
|
|
@@ -50,10 +57,10 @@ module ScriptDetector2
|
|
50
57
|
end
|
51
58
|
|
52
59
|
# @param string [String]
|
53
|
-
# @return [Boolean]
|
60
|
+
# @return [Boolean] true if +string+ appears to be Korean
|
54
61
|
def korean?(string)
|
55
|
-
return true if string
|
56
|
-
return false if string
|
62
|
+
return true if hangul?(string)
|
63
|
+
return false if kana?(string)
|
57
64
|
|
58
65
|
hanja = string.scan(/\p{Han}/)
|
59
66
|
return false unless hanja.any?
|
@@ -61,16 +68,77 @@ module ScriptDetector2
|
|
61
68
|
hanja.all?(KOREAN_PATTERN)
|
62
69
|
end
|
63
70
|
|
71
|
+
# @param string [String]
|
72
|
+
# @return [Boolean] true if +string+ contains Hangul
|
73
|
+
def hangul?(string)
|
74
|
+
/\p{Hangul}/.match?(string)
|
75
|
+
end
|
76
|
+
|
77
|
+
# Make a best-effort attempt to guess the singular script of +string+.
|
78
|
+
# Result is a symbol representing one of the scripts defined by ISO 15924,
|
79
|
+
# namely one of:
|
80
|
+
# - Hans (Simplified Chinese)
|
81
|
+
# - Hant (Traditional Chinese)
|
82
|
+
# - Hani (Unspecified Han)
|
83
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
84
|
+
# - Kore (Korean: Hangul, Han)
|
85
|
+
# - Zyyy (Undetermined)
|
86
|
+
#
|
87
|
+
# Note that this is likely to give poor results for very short strings,
|
88
|
+
# which are often inherently ambiguous.
|
89
|
+
#
|
64
90
|
# @param string [String]
|
65
91
|
# @return [Symbol]
|
66
92
|
def identify_script(string)
|
67
|
-
if
|
93
|
+
return :Jpan if kana?(string)
|
94
|
+
return :Kore if hangul?(string)
|
95
|
+
|
96
|
+
is_hant = traditional_chinese?(string)
|
97
|
+
is_hans = simplified_chinese?(string)
|
98
|
+
return :Hani if is_hant && is_hans
|
99
|
+
|
100
|
+
is_japanese = japanese?(string)
|
101
|
+
return :Hani if is_japanese && (is_hant || is_hans)
|
102
|
+
|
103
|
+
# At this point we have determined that the string does not contain
|
104
|
+
# Hangul; for such a string to be Korean would be unusual. Allowing Korean
|
105
|
+
# to dilute the result to Hani is going to be a loss on average, so we
|
106
|
+
# don't handle it like Japanese above.
|
107
|
+
|
108
|
+
if is_hans then :Hans
|
109
|
+
elsif is_hant then :Hant
|
110
|
+
elsif is_japanese then :Jpan
|
68
111
|
elsif korean?(string) then :Kore
|
69
|
-
elsif traditional_chinese?(string) then :Hant
|
70
|
-
elsif simplified_chinese?(string) then :Hans
|
71
112
|
elsif chinese?(string) then :Hani
|
72
|
-
else
|
113
|
+
else
|
114
|
+
:Zyyy
|
73
115
|
end
|
74
116
|
end
|
117
|
+
|
118
|
+
# Identify all CJK scripts represented in +string+. Result is a list of symbols
|
119
|
+
# representing scripts defined by ISO 15924, namely one or more of:
|
120
|
+
# - Hans (Simplified Chinese)
|
121
|
+
# - Hant (Traditional Chinese)
|
122
|
+
# - Hani (Unspecified Han)
|
123
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
124
|
+
# - Kore (Korean: Hangul, Han)
|
125
|
+
# - Zyyy (Undetermined)
|
126
|
+
#
|
127
|
+
# This method does not attempt to identify other scripts such as Latn.
|
128
|
+
#
|
129
|
+
# @param string [String]
|
130
|
+
# @return [Array<Symbol>]
|
131
|
+
def identify_scripts(string)
|
132
|
+
result = []
|
133
|
+
|
134
|
+
result << :Hans if simplified_chinese?(string)
|
135
|
+
result << :Hant if traditional_chinese?(string)
|
136
|
+
result << :Jpan if japanese?(string)
|
137
|
+
result << :Kore if korean?(string)
|
138
|
+
result << :Hani if chinese?(string) && result.empty?
|
139
|
+
result << :Zyyy if result.empty?
|
140
|
+
|
141
|
+
result
|
142
|
+
end
|
75
143
|
end
|
76
144
|
end
|
data/script_detector_2.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
19
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
|
20
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
21
|
|
21
22
|
# Specify which files should be added to the gem when it is released.
|
22
23
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
data/tasks/gen_src.rake
CHANGED
data/tasks/unihan.rb
CHANGED
@@ -5,15 +5,20 @@ module Unihan
|
|
5
5
|
CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
|
6
6
|
|
7
7
|
class << self
|
8
|
-
# @param
|
8
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
9
9
|
# @param tags [Array<String>]
|
10
10
|
# @return [Regexp]
|
11
11
|
def gen_unihan_core_pattern(dict_data, *tags)
|
12
|
-
|
12
|
+
gen_pattern(codepoints_for_tags(dict_data, tags))
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
16
|
+
# @param tags [Array<String>]
|
17
|
+
# @return [Array<Integer>]
|
18
|
+
def codepoints_for_tags(dict_data, tags)
|
19
|
+
dict_data.select do |_, data|
|
13
20
|
tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
|
14
21
|
end.keys
|
15
|
-
|
16
|
-
gen_pattern(codepoints)
|
17
22
|
end
|
18
23
|
|
19
24
|
# @param codepoints [Array<Integer>]
|
@@ -22,11 +27,13 @@ module Unihan
|
|
22
27
|
alts = group(codepoints).map do |first, last|
|
23
28
|
if first == last
|
24
29
|
format('\u{%x}', first)
|
30
|
+
elsif first.succ == last
|
31
|
+
format('\u{%<first>x}\u{%<last>x}', first: first, last: last)
|
25
32
|
else
|
26
33
|
format('\u{%<first>x}-\u{%<last>x}', first: first, last: last)
|
27
34
|
end
|
28
35
|
end
|
29
|
-
|
36
|
+
/\A[#{alts.join}]+\Z/
|
30
37
|
end
|
31
38
|
|
32
39
|
# @param codepoints [Array<Integer>]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -76,6 +76,7 @@ files:
|
|
76
76
|
- ".dir-locals.el"
|
77
77
|
- ".rubocop.yml"
|
78
78
|
- ".rubocop_todo.yml"
|
79
|
+
- ".ruby-version"
|
79
80
|
- ".solargraph.yml"
|
80
81
|
- CHANGELOG.md
|
81
82
|
- Gemfile
|
@@ -99,6 +100,7 @@ licenses:
|
|
99
100
|
metadata:
|
100
101
|
homepage_uri: https://github.com/amake/script_detector_2
|
101
102
|
source_code_uri: https://github.com/amake/script_detector_2.git
|
103
|
+
rubygems_mfa_required: 'true'
|
102
104
|
post_install_message:
|
103
105
|
rdoc_options: []
|
104
106
|
require_paths:
|