script_detector_2 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScriptDetector2
4
- VERSION = '0.1.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,10 +8,10 @@ require_relative 'script_detector_2/string'
8
8
  module ScriptDetector2
9
9
  class << self
10
10
  # @param string [String]
11
- # @return [Boolean]
11
+ # @return [Boolean] true if +string+ appears to be Japanese
12
12
  def japanese?(string)
13
- return true if string =~ /[\p{Hiragana}\p{Katakana}]/
14
- return false if string =~ /\p{Hangul}/
13
+ return true if kana?(string)
14
+ return false if hangul?(string)
15
15
 
16
16
  kanji = string.scan(/\p{Han}/)
17
17
  return false unless kanji.any?
@@ -20,15 +20,22 @@ module ScriptDetector2
20
20
  end
21
21
 
22
22
  # @param string [String]
23
- # @return [Boolean]
23
+ # @return [Boolean] true if +string+ contains Hiragana or Katakana
24
+ def kana?(string)
25
+ /[\p{Hiragana}\p{Katakana}]/.match?(string)
26
+ end
27
+
28
+ # @param string [String]
29
+ # @return [Boolean] true if +string+ appears to be Chinese (either
30
+ # Simplified or Traditional)
24
31
  def chinese?(string)
25
32
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
26
33
 
27
- string =~ /\p{Han}/
34
+ /\p{Han}/.match?(string)
28
35
  end
29
36
 
30
37
  # @param string [String]
31
- # @return [Boolean]
38
+ # @return [Boolean] true if +string+ appears to be Simplified Chinese
32
39
  def simplified_chinese?(string)
33
40
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
34
41
 
@@ -39,7 +46,7 @@ module ScriptDetector2
39
46
  end
40
47
 
41
48
  # @param string [String]
42
- # @return [Boolean]
49
+ # @return [Boolean] true if +string+ appears to be Traditional Chinese
43
50
  def traditional_chinese?(string)
44
51
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
45
52
 
@@ -50,10 +57,10 @@ module ScriptDetector2
50
57
  end
51
58
 
52
59
  # @param string [String]
53
- # @return [Boolean]
60
+ # @return [Boolean] true if +string+ appears to be Korean
54
61
  def korean?(string)
55
- return true if string =~ /\p{Hangul}/
56
- return false if string =~ /[\p{Hiragana}\p{Katakana}]/
62
+ return true if hangul?(string)
63
+ return false if kana?(string)
57
64
 
58
65
  hanja = string.scan(/\p{Han}/)
59
66
  return false unless hanja.any?
@@ -61,16 +68,77 @@ module ScriptDetector2
61
68
  hanja.all?(KOREAN_PATTERN)
62
69
  end
63
70
 
71
+ # @param string [String]
72
+ # @return [Boolean] true if +string+ contains Hangul
73
+ def hangul?(string)
74
+ /\p{Hangul}/.match?(string)
75
+ end
76
+
77
+ # Make a best-effort attempt to guess the single script that best describes +string+.
78
+ # Result is a symbol representing one of the scripts defined by ISO 15924,
79
+ # namely one of:
80
+ # - Hans (Simplified Chinese)
81
+ # - Hant (Traditional Chinese)
82
+ # - Hani (Unspecified Han)
83
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
84
+ # - Kore (Korean: Hangul, Han)
85
+ # - Zyyy (Undetermined)
86
+ #
87
+ # Note that this is likely to give poor results for very short strings,
88
+ # which are often inherently ambiguous.
89
+ #
64
90
  # @param string [String]
65
91
  # @return [Symbol]
66
92
  def identify_script(string)
67
- if japanese?(string) then :Jpan
93
+ return :Jpan if kana?(string)
94
+ return :Kore if hangul?(string)
95
+
96
+ is_hant = traditional_chinese?(string)
97
+ is_hans = simplified_chinese?(string)
98
+ return :Hani if is_hant && is_hans
99
+
100
+ is_japanese = japanese?(string)
101
+ return :Hani if is_japanese && (is_hant || is_hans)
102
+
103
+ # At this point we have determined that the string does not contain
104
+ # Hangul; for such a string to be Korean would be unusual. Allowing Korean
105
+ # to dilute the result to Hani is going to be a loss on average, so we
106
+ # don't handle it like Japanese above.
107
+
108
+ if is_hans then :Hans
109
+ elsif is_hant then :Hant
110
+ elsif is_japanese then :Jpan
68
111
  elsif korean?(string) then :Kore
69
- elsif traditional_chinese?(string) then :Hant
70
- elsif simplified_chinese?(string) then :Hans
71
112
  elsif chinese?(string) then :Hani
72
- else :Zyyy
113
+ else
114
+ :Zyyy
73
115
  end
74
116
  end
117
+
118
+ # Identify all CJK scripts represented in +string+. Result is a list of symbols
119
+ # representing scripts defined by ISO 15924, namely one or more of:
120
+ # - Hans (Simplified Chinese)
121
+ # - Hant (Traditional Chinese)
122
+ # - Hani (Unspecified Han)
123
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
124
+ # - Kore (Korean: Hangul, Han)
125
+ # - Zyyy (Undetermined)
126
+ #
127
+ # This method does not attempt to identify other scripts such as Latn.
128
+ #
129
+ # @param string [String]
130
+ # @return [Array<Symbol>]
131
+ def identify_scripts(string)
132
+ result = []
133
+
134
+ result << :Hans if simplified_chinese?(string)
135
+ result << :Hant if traditional_chinese?(string)
136
+ result << :Jpan if japanese?(string)
137
+ result << :Kore if korean?(string)
138
+ result << :Hani if chinese?(string) && result.empty?
139
+ result << :Zyyy if result.empty?
140
+
141
+ result
142
+ end
75
143
  end
76
144
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  spec.metadata['homepage_uri'] = spec.homepage
19
19
  spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
20
+ spec.metadata['rubygems_mfa_required'] = 'true'
20
21
 
21
22
  # Specify which files should be added to the gem when it is released.
22
23
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
data/tasks/gen_src.rake CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  TMP_DIR = 'tmp'
4
4
  UNIHAN_ZIP = File.join(TMP_DIR, 'Unihan.zip')
5
- UNIHAN_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip'
5
+ UNIHAN_URL = 'https://www.unicode.org/Public/14.0.0/ucd/Unihan.zip'
6
6
 
7
7
  directory TMP_DIR
8
8
 
data/tasks/unihan.rb CHANGED
@@ -5,15 +5,20 @@ module Unihan
5
5
  CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
6
6
 
7
7
  class << self
8
- # @param readings_data [Hash<Integer,Hash{String => String}>]
8
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
9
9
  # @param tags [Array<String>]
10
10
  # @return [Regexp]
11
11
  def gen_unihan_core_pattern(dict_data, *tags)
12
- codepoints = dict_data.select do |_, data|
12
+ gen_pattern(codepoints_for_tags(dict_data, tags))
13
+ end
14
+
15
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
16
+ # @param tags [Array<String>]
17
+ # @return [Array<Integer>]
18
+ def codepoints_for_tags(dict_data, tags)
19
+ dict_data.select do |_, data|
13
20
  tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
14
21
  end.keys
15
-
16
- gen_pattern(codepoints)
17
22
  end
18
23
 
19
24
  # @param codepoints [Array<Integer>]
@@ -22,11 +27,13 @@ module Unihan
22
27
  alts = group(codepoints).map do |first, last|
23
28
  if first == last
24
29
  format('\u{%x}', first)
30
+ elsif first.succ == last
31
+ format('\u{%<first>x}\u{%<last>x}', first: first, last: last)
25
32
  else
26
33
  format('\u{%<first>x}-\u{%<last>x}', first: first, last: last)
27
34
  end
28
35
  end
29
- /[#{alts.join}]/
36
+ /\A[#{alts.join}]+\Z/
30
37
  end
31
38
 
32
39
  # @param codepoints [Array<Integer>]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: script_detector_2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-22 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -76,6 +76,7 @@ files:
76
76
  - ".dir-locals.el"
77
77
  - ".rubocop.yml"
78
78
  - ".rubocop_todo.yml"
79
+ - ".ruby-version"
79
80
  - ".solargraph.yml"
80
81
  - CHANGELOG.md
81
82
  - Gemfile
@@ -99,6 +100,7 @@ licenses:
99
100
  metadata:
100
101
  homepage_uri: https://github.com/amake/script_detector_2
101
102
  source_code_uri: https://github.com/amake/script_detector_2.git
103
+ rubygems_mfa_required: 'true'
102
104
  post_install_message:
103
105
  rdoc_options: []
104
106
  require_paths: