script_detector_2 0.1.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScriptDetector2
4
- VERSION = '0.1.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,10 +8,10 @@ require_relative 'script_detector_2/string'
8
8
  module ScriptDetector2
9
9
  class << self
10
10
  # @param string [String]
11
- # @return [Boolean]
11
+ # @return [Boolean] true if +string+ appears to be Japanese
12
12
  def japanese?(string)
13
- return true if string =~ /[\p{Hiragana}\p{Katakana}]/
14
- return false if string =~ /\p{Hangul}/
13
+ return true if kana?(string)
14
+ return false if hangul?(string)
15
15
 
16
16
  kanji = string.scan(/\p{Han}/)
17
17
  return false unless kanji.any?
@@ -20,15 +20,22 @@ module ScriptDetector2
20
20
  end
21
21
 
22
22
  # @param string [String]
23
- # @return [Boolean]
23
+ # @return [Boolean] true if +string+ contains Hiragana or Katakana
24
+ def kana?(string)
25
+ /[\p{Hiragana}\p{Katakana}]/.match?(string)
26
+ end
27
+
28
+ # @param string [String]
29
+ # @return [Boolean] true if +string+ appears to be Chinese (either
30
+ # Simplified or Traditional)
24
31
  def chinese?(string)
25
32
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
26
33
 
27
- string =~ /\p{Han}/
34
+ /\p{Han}/.match?(string)
28
35
  end
29
36
 
30
37
  # @param string [String]
31
- # @return [Boolean]
38
+ # @return [Boolean] true if +string+ appears to be Simplified Chinese
32
39
  def simplified_chinese?(string)
33
40
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
34
41
 
@@ -39,7 +46,7 @@ module ScriptDetector2
39
46
  end
40
47
 
41
48
  # @param string [String]
42
- # @return [Boolean]
49
+ # @return [Boolean] true if +string+ appears to be Traditional Chinese
43
50
  def traditional_chinese?(string)
44
51
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
45
52
 
@@ -50,10 +57,10 @@ module ScriptDetector2
50
57
  end
51
58
 
52
59
  # @param string [String]
53
- # @return [Boolean]
60
+ # @return [Boolean] true if +string+ appears to be Korean
54
61
  def korean?(string)
55
- return true if string =~ /\p{Hangul}/
56
- return false if string =~ /[\p{Hiragana}\p{Katakana}]/
62
+ return true if hangul?(string)
63
+ return false if kana?(string)
57
64
 
58
65
  hanja = string.scan(/\p{Han}/)
59
66
  return false unless hanja.any?
@@ -61,16 +68,77 @@ module ScriptDetector2
61
68
  hanja.all?(KOREAN_PATTERN)
62
69
  end
63
70
 
71
+ # @param string [String]
72
+ # @return [Boolean] true if +string+ contains Hangul
73
+ def hangul?(string)
74
+ /\p{Hangul}/.match?(string)
75
+ end
76
+
77
+ # Make a best-effort attempt to guess the single script of +string+.
78
+ # Result is a symbol representing one of the scripts defined by ISO 15924,
79
+ # namely one of:
80
+ # - Hans (Simplified Chinese)
81
+ # - Hant (Traditional Chinese)
82
+ # - Hani (Unspecified Han)
83
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
84
+ # - Kore (Korean: Hangul, Han)
85
+ # - Zyyy (Undetermined)
86
+ #
87
+ # Note that this is likely to give poor results for very short strings,
88
+ # which are often inherently ambiguous.
89
+ #
64
90
  # @param string [String]
65
91
  # @return [Symbol]
66
92
  def identify_script(string)
67
- if japanese?(string) then :Jpan
93
+ return :Jpan if kana?(string)
94
+ return :Kore if hangul?(string)
95
+
96
+ is_hant = traditional_chinese?(string)
97
+ is_hans = simplified_chinese?(string)
98
+ return :Hani if is_hant && is_hans
99
+
100
+ is_japanese = japanese?(string)
101
+ return :Hani if is_japanese && (is_hant || is_hans)
102
+
103
+ # At this point we have determined that the string does not contain
104
+ # Hangul; for such a string to be Korean would be unusual. Allowing Korean
105
+ # to dilute the result to Hani is going to be a loss on average, so we
106
+ # don't handle it like Japanese above.
107
+
108
+ if is_hans then :Hans
109
+ elsif is_hant then :Hant
110
+ elsif is_japanese then :Jpan
68
111
  elsif korean?(string) then :Kore
69
- elsif traditional_chinese?(string) then :Hant
70
- elsif simplified_chinese?(string) then :Hans
71
112
  elsif chinese?(string) then :Hani
72
- else :Zyyy
113
+ else
114
+ :Zyyy
73
115
  end
74
116
  end
117
+
118
+ # Identify all CJK scripts represented in +string+. Result is a list of symbols
119
+ # representing scripts defined by ISO 15924, namely one or more of:
120
+ # - Hans (Simplified Chinese)
121
+ # - Hant (Traditional Chinese)
122
+ # - Hani (Unspecified Han)
123
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
124
+ # - Kore (Korean: Hangul, Han)
125
+ # - Zyyy (Undetermined)
126
+ #
127
+ # This method does not attempt to identify other scripts such as Latn.
128
+ #
129
+ # @param string [String]
130
+ # @return [Array<Symbol>]
131
+ def identify_scripts(string)
132
+ result = []
133
+
134
+ result << :Hans if simplified_chinese?(string)
135
+ result << :Hant if traditional_chinese?(string)
136
+ result << :Jpan if japanese?(string)
137
+ result << :Kore if korean?(string)
138
+ result << :Hani if chinese?(string) && result.empty?
139
+ result << :Zyyy if result.empty?
140
+
141
+ result
142
+ end
75
143
  end
76
144
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  spec.metadata['homepage_uri'] = spec.homepage
19
19
  spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
20
+ spec.metadata['rubygems_mfa_required'] = 'true'
20
21
 
21
22
  # Specify which files should be added to the gem when it is released.
22
23
  # `git ls-files -z` lists the files in the RubyGem that have been added to git.
data/tasks/gen_src.rake CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  TMP_DIR = 'tmp'
4
4
  UNIHAN_ZIP = File.join(TMP_DIR, 'Unihan.zip')
5
- UNIHAN_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip'
5
+ UNIHAN_URL = 'https://www.unicode.org/Public/14.0.0/ucd/Unihan.zip'
6
6
 
7
7
  directory TMP_DIR
8
8
 
data/tasks/unihan.rb CHANGED
@@ -5,15 +5,20 @@ module Unihan
5
5
  CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
6
6
 
7
7
  class << self
8
- # @param readings_data [Hash<Integer,Hash{String => String}>]
8
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
9
9
  # @param tags [Array<String>]
10
10
  # @return [Regexp]
11
11
  def gen_unihan_core_pattern(dict_data, *tags)
12
- codepoints = dict_data.select do |_, data|
12
+ gen_pattern(codepoints_for_tags(dict_data, tags))
13
+ end
14
+
15
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
16
+ # @param tags [Array<String>]
17
+ # @return [Array<Integer>]
18
+ def codepoints_for_tags(dict_data, tags)
19
+ dict_data.select do |_, data|
13
20
  tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
14
21
  end.keys
15
-
16
- gen_pattern(codepoints)
17
22
  end
18
23
 
19
24
  # @param codepoints [Array<Integer>]
@@ -22,11 +27,13 @@ module Unihan
22
27
  alts = group(codepoints).map do |first, last|
23
28
  if first == last
24
29
  format('\u{%x}', first)
30
+ elsif first.succ == last
31
+ format('\u{%<first>x}\u{%<last>x}', first: first, last: last)
25
32
  else
26
33
  format('\u{%<first>x}-\u{%<last>x}', first: first, last: last)
27
34
  end
28
35
  end
29
- /[#{alts.join}]/
36
+ /\A[#{alts.join}]+\Z/
30
37
  end
31
38
 
32
39
  # @param codepoints [Array<Integer>]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: script_detector_2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-22 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -76,6 +76,7 @@ files:
76
76
  - ".dir-locals.el"
77
77
  - ".rubocop.yml"
78
78
  - ".rubocop_todo.yml"
79
+ - ".ruby-version"
79
80
  - ".solargraph.yml"
80
81
  - CHANGELOG.md
81
82
  - Gemfile
@@ -99,6 +100,7 @@ licenses:
99
100
  metadata:
100
101
  homepage_uri: https://github.com/amake/script_detector_2
101
102
  source_code_uri: https://github.com/amake/script_detector_2.git
103
+ rubygems_mfa_required: 'true'
102
104
  post_install_message:
103
105
  rdoc_options: []
104
106
  require_paths: