script_detector_2 0.1.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +17 -2
- data/.ruby-version +1 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile.lock +12 -12
- data/README.md +11 -1
- data/lib/script_detector_2/patterns.gen.rb +4 -4
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +82 -14
- data/script_detector_2.gemspec +1 -0
- data/tasks/gen_src.rake +1 -1
- data/tasks/unihan.rb +12 -5
- metadata +4 -2
data/lib/script_detector_2.rb
CHANGED
@@ -8,10 +8,10 @@ require_relative 'script_detector_2/string'
|
|
8
8
|
module ScriptDetector2
|
9
9
|
class << self
|
10
10
|
# @param string [String]
|
11
|
-
# @return [Boolean]
|
11
|
+
# @return [Boolean] true if +string+ appears to be Japanese
|
12
12
|
def japanese?(string)
|
13
|
-
return true if string
|
14
|
-
return false if string
|
13
|
+
return true if kana?(string)
|
14
|
+
return false if hangul?(string)
|
15
15
|
|
16
16
|
kanji = string.scan(/\p{Han}/)
|
17
17
|
return false unless kanji.any?
|
@@ -20,15 +20,22 @@ module ScriptDetector2
|
|
20
20
|
end
|
21
21
|
|
22
22
|
# @param string [String]
|
23
|
-
# @return [Boolean]
|
23
|
+
# @return [Boolean] true if +string+ contains Hiragana or Katakana
|
24
|
+
def kana?(string)
|
25
|
+
/[\p{Hiragana}\p{Katakana}]/.match?(string)
|
26
|
+
end
|
27
|
+
|
28
|
+
# @param string [String]
|
29
|
+
# @return [Boolean] true if +string+ appears to be Chinese (either
|
30
|
+
# Simplified or Traditional)
|
24
31
|
def chinese?(string)
|
25
32
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
26
33
|
|
27
|
-
|
34
|
+
/\p{Han}/.match?(string)
|
28
35
|
end
|
29
36
|
|
30
37
|
# @param string [String]
|
31
|
-
# @return [Boolean]
|
38
|
+
# @return [Boolean] true if +string+ appears to be Simplified Chinese
|
32
39
|
def simplified_chinese?(string)
|
33
40
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
34
41
|
|
@@ -39,7 +46,7 @@ module ScriptDetector2
|
|
39
46
|
end
|
40
47
|
|
41
48
|
# @param string [String]
|
42
|
-
# @return [Boolean]
|
49
|
+
# @return [Boolean] true if +string+ appears to be Traditional Chinese
|
43
50
|
def traditional_chinese?(string)
|
44
51
|
return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
|
45
52
|
|
@@ -50,10 +57,10 @@ module ScriptDetector2
|
|
50
57
|
end
|
51
58
|
|
52
59
|
# @param string [String]
|
53
|
-
# @return [Boolean]
|
60
|
+
# @return [Boolean] true if +string+ appears to be Korean
|
54
61
|
def korean?(string)
|
55
|
-
return true if string
|
56
|
-
return false if string
|
62
|
+
return true if hangul?(string)
|
63
|
+
return false if kana?(string)
|
57
64
|
|
58
65
|
hanja = string.scan(/\p{Han}/)
|
59
66
|
return false unless hanja.any?
|
@@ -61,16 +68,77 @@ module ScriptDetector2
|
|
61
68
|
hanja.all?(KOREAN_PATTERN)
|
62
69
|
end
|
63
70
|
|
71
|
+
# @param string [String]
|
72
|
+
# @return [Boolean] true if +string+ contains Hangul
|
73
|
+
def hangul?(string)
|
74
|
+
/\p{Hangul}/.match?(string)
|
75
|
+
end
|
76
|
+
|
77
|
+
# Make a best-effort attempt to guess the singular script of +string+.
|
78
|
+
# Result is a symbol representing one of the scripts defined by ISO 15924,
|
79
|
+
# namely one of:
|
80
|
+
# - Hans (Simplified Chinese)
|
81
|
+
# - Hant (Traditional Chinese)
|
82
|
+
# - Hani (Unspecified Han)
|
83
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
84
|
+
# - Kore (Korean: Hangul, Han)
|
85
|
+
# - Zyyy (Undetermined)
|
86
|
+
#
|
87
|
+
# Note that this is likely to give poor results for very short strings,
|
88
|
+
# which are often inherently ambiguous.
|
89
|
+
#
|
64
90
|
# @param string [String]
|
65
91
|
# @return [Symbol]
|
66
92
|
def identify_script(string)
|
67
|
-
if
|
93
|
+
return :Jpan if kana?(string)
|
94
|
+
return :Kore if hangul?(string)
|
95
|
+
|
96
|
+
is_hant = traditional_chinese?(string)
|
97
|
+
is_hans = simplified_chinese?(string)
|
98
|
+
return :Hani if is_hant && is_hans
|
99
|
+
|
100
|
+
is_japanese = japanese?(string)
|
101
|
+
return :Hani if is_japanese && (is_hant || is_hans)
|
102
|
+
|
103
|
+
# At this point we have determined that the string does not contain
|
104
|
+
# Hangul; for such a string to be Korean would be unusual. Allowing Korean
|
105
|
+
# to dilute the result to Hani is going to be a loss on average, so we
|
106
|
+
# don't handle it like Japanese above.
|
107
|
+
|
108
|
+
if is_hans then :Hans
|
109
|
+
elsif is_hant then :Hant
|
110
|
+
elsif is_japanese then :Jpan
|
68
111
|
elsif korean?(string) then :Kore
|
69
|
-
elsif traditional_chinese?(string) then :Hant
|
70
|
-
elsif simplified_chinese?(string) then :Hans
|
71
112
|
elsif chinese?(string) then :Hani
|
72
|
-
else
|
113
|
+
else
|
114
|
+
:Zyyy
|
73
115
|
end
|
74
116
|
end
|
117
|
+
|
118
|
+
# Identify all CJK scripts represented in +string+. Result is a list of symbols
|
119
|
+
# representing scripts defined by ISO 15924, namely one or more of:
|
120
|
+
# - Hans (Simplified Chinese)
|
121
|
+
# - Hant (Traditional Chinese)
|
122
|
+
# - Hani (Unspecified Han)
|
123
|
+
# - Jpan (Japanese: Han, Hiragana, Katakana)
|
124
|
+
# - Kore (Korean: Hangul, Han)
|
125
|
+
# - Zyyy (Undetermined)
|
126
|
+
#
|
127
|
+
# This method does not attempt to identify other scripts such as Latn.
|
128
|
+
#
|
129
|
+
# @param string [String]
|
130
|
+
# @return [Array<Symbol>]
|
131
|
+
def identify_scripts(string)
|
132
|
+
result = []
|
133
|
+
|
134
|
+
result << :Hans if simplified_chinese?(string)
|
135
|
+
result << :Hant if traditional_chinese?(string)
|
136
|
+
result << :Jpan if japanese?(string)
|
137
|
+
result << :Kore if korean?(string)
|
138
|
+
result << :Hani if chinese?(string) && result.empty?
|
139
|
+
result << :Zyyy if result.empty?
|
140
|
+
|
141
|
+
result
|
142
|
+
end
|
75
143
|
end
|
76
144
|
end
|
data/script_detector_2.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
19
19
|
spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
|
20
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
21
|
|
21
22
|
# Specify which files should be added to the gem when it is released.
|
22
23
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
data/tasks/gen_src.rake
CHANGED
data/tasks/unihan.rb
CHANGED
@@ -5,15 +5,20 @@ module Unihan
|
|
5
5
|
CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
|
6
6
|
|
7
7
|
class << self
|
8
|
-
# @param
|
8
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
9
9
|
# @param tags [Array<String>]
|
10
10
|
# @return [Regexp]
|
11
11
|
def gen_unihan_core_pattern(dict_data, *tags)
|
12
|
-
|
12
|
+
gen_pattern(codepoints_for_tags(dict_data, tags))
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param dict_data [Hash<Integer,Hash{String => String}>]
|
16
|
+
# @param tags [Array<String>]
|
17
|
+
# @return [Array<Integer>]
|
18
|
+
def codepoints_for_tags(dict_data, tags)
|
19
|
+
dict_data.select do |_, data|
|
13
20
|
tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
|
14
21
|
end.keys
|
15
|
-
|
16
|
-
gen_pattern(codepoints)
|
17
22
|
end
|
18
23
|
|
19
24
|
# @param codepoints [Array<Integer>]
|
@@ -22,11 +27,13 @@ module Unihan
|
|
22
27
|
alts = group(codepoints).map do |first, last|
|
23
28
|
if first == last
|
24
29
|
format('\u{%x}', first)
|
30
|
+
elsif first.succ == last
|
31
|
+
format('\u{%<first>x}\u{%<last>x}', first: first, last: last)
|
25
32
|
else
|
26
33
|
format('\u{%<first>x}-\u{%<last>x}', first: first, last: last)
|
27
34
|
end
|
28
35
|
end
|
29
|
-
|
36
|
+
/\A[#{alts.join}]+\Z/
|
30
37
|
end
|
31
38
|
|
32
39
|
# @param codepoints [Array<Integer>]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: script_detector_2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aaron Madlon-Kay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -76,6 +76,7 @@ files:
|
|
76
76
|
- ".dir-locals.el"
|
77
77
|
- ".rubocop.yml"
|
78
78
|
- ".rubocop_todo.yml"
|
79
|
+
- ".ruby-version"
|
79
80
|
- ".solargraph.yml"
|
80
81
|
- CHANGELOG.md
|
81
82
|
- Gemfile
|
@@ -99,6 +100,7 @@ licenses:
|
|
99
100
|
metadata:
|
100
101
|
homepage_uri: https://github.com/amake/script_detector_2
|
101
102
|
source_code_uri: https://github.com/amake/script_detector_2.git
|
103
|
+
rubygems_mfa_required: 'true'
|
102
104
|
post_install_message:
|
103
105
|
rdoc_options: []
|
104
106
|
require_paths:
|