script_detector_2 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9709f6c83f82a3bf7073bb8b9cfc95a1edea6885e0f596ef6e93f01a62c26dbb
4
- data.tar.gz: 3ed583c8487617e9687b3c776116f0cc22140993ef9debf4f8542057e9f9f232
3
+ metadata.gz: 37ab1716845b98ca15a072e67cd900f5854dadd3c3641667fb5887c1626c4a85
4
+ data.tar.gz: 5142f341e40601f3d1fff211ff1fa566047f6353d0d3f6bc5062653827e759db
5
5
  SHA512:
6
- metadata.gz: 5d17c19eee4868b540af844e08c95133415de8ccf0293dd208c11a29c14315b716c1563c45fd22081fbbf604147c837fa14bc98772a4103b2df080e4d09f89fa
7
- data.tar.gz: 4a7894f23f49494f9debcafc967217ccb00549234014166f9fe4f1fe3d13b12b15fbe63eac426b164dfa763acbc9b4e15dfa8cd616de84720f1b2ea3249abfde
6
+ metadata.gz: 35b1771a4d8898d6ca02f38b4a016adc927600e7456b0a41ae515f88e85c846d8a41951ce78fa12c7a2851fc596a13ab48a3537dd97ff4579e8cd45d19b50ccf
7
+ data.tar.gz: ea6da32d1c4b9a10a4c04208fda4bee7bc6bad7d8ffa635f163c608e256160b4115fe3c1a8f4e7c9ec8b65f435ddca1fcb7af9f857a82042edd11457685d7424
data/.rubocop_todo.yml CHANGED
@@ -1,22 +1,27 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2021-10-11 07:56:31 UTC using RuboCop version 1.21.0.
3
+ # on 2021-11-24 02:35:34 UTC using RuboCop version 1.23.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 1
9
+ # Offense count: 2
10
+ # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
+ Metrics/AbcSize:
12
+ Max: 20
13
+
14
+ # Offense count: 2
10
15
  # Configuration parameters: IgnoredMethods.
11
16
  Metrics/CyclomaticComplexity:
12
- Max: 10
17
+ Max: 13
13
18
 
14
19
  # Offense count: 2
15
20
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
16
21
  Metrics/MethodLength:
17
- Max: 14
22
+ Max: 15
18
23
 
19
24
  # Offense count: 1
20
25
  # Configuration parameters: IgnoredMethods.
21
26
  Metrics/PerceivedComplexity:
22
- Max: 11
27
+ Max: 14
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.4.0] - 2021-11-24
4
+
5
+ - Add `identify_scripts` method
6
+ - Improve accuracy of `identify_script` method
7
+
3
8
  ## [0.3.0] - 2021-10-13
4
9
 
5
10
  - Add `kana?` and `hangul?` methods
data/Gemfile.lock CHANGED
@@ -1,14 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- script_detector_2 (0.3.0)
4
+ script_detector_2 (0.4.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
10
  backport (1.2.0)
11
- benchmark (0.1.1)
11
+ benchmark (0.2.0)
12
12
  byebug (11.1.3)
13
13
  diff-lcs (1.4.4)
14
14
  e2mmap (0.1.0)
@@ -23,14 +23,14 @@ GEM
23
23
  parallel (1.21.0)
24
24
  parser (3.0.2.0)
25
25
  ast (~> 2.4.1)
26
- racc (1.5.2)
26
+ racc (1.6.0)
27
27
  rainbow (3.0.0)
28
28
  rake (13.0.6)
29
29
  regexp_parser (2.1.1)
30
- reverse_markdown (2.0.0)
30
+ reverse_markdown (2.1.1)
31
31
  nokogiri
32
32
  rexml (3.2.5)
33
- rubocop (1.22.1)
33
+ rubocop (1.23.0)
34
34
  parallel (~> 1.10)
35
35
  parser (>= 3.0.0.0)
36
36
  rainbow (>= 2.2.2, < 4.0)
@@ -39,11 +39,11 @@ GEM
39
39
  rubocop-ast (>= 1.12.0, < 2.0)
40
40
  ruby-progressbar (~> 1.7)
41
41
  unicode-display_width (>= 1.4.0, < 3.0)
42
- rubocop-ast (1.12.0)
42
+ rubocop-ast (1.13.0)
43
43
  parser (>= 3.0.1.1)
44
44
  ruby-progressbar (1.11.0)
45
45
  rubyzip (2.3.2)
46
- solargraph (0.44.0)
46
+ solargraph (0.44.2)
47
47
  backport (~> 1.2)
48
48
  benchmark
49
49
  bundler (>= 1.17.2)
@@ -76,4 +76,4 @@ DEPENDENCIES
76
76
  solargraph
77
77
 
78
78
  BUNDLED WITH
79
- 2.2.29
79
+ 2.2.32
data/README.md CHANGED
@@ -42,6 +42,7 @@ The main detection methods are:
42
42
  - `ScriptDetector2.simplified_chinese?`
43
43
  - `ScriptDetector2.traditional_chinese?`
44
44
  - `ScriptDetector2.identify_script`
45
+ - `ScriptDetector2.identify_scripts`
45
46
 
46
47
  Regexp patterns are used to identify the script to which Han characters belong.
47
48
  These can be used directly as well:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScriptDetector2
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,7 +8,7 @@ require_relative 'script_detector_2/string'
8
8
  module ScriptDetector2
9
9
  class << self
10
10
  # @param string [String]
11
- # @return [Boolean]
11
+ # @return [Boolean] true if +string+ appears to be Japanese
12
12
  def japanese?(string)
13
13
  return true if kana?(string)
14
14
  return false if hangul?(string)
@@ -20,13 +20,14 @@ module ScriptDetector2
20
20
  end
21
21
 
22
22
  # @param string [String]
23
- # @return [Boolean]
23
+ # @return [Boolean] true if +string+ contains Hiragana or Katakana
24
24
  def kana?(string)
25
25
  /[\p{Hiragana}\p{Katakana}]/.match?(string)
26
26
  end
27
27
 
28
28
  # @param string [String]
29
- # @return [Boolean]
29
+ # @return [Boolean] true if +string+ appears to be Chinese (either
30
+ # Simplified or Traditional)
30
31
  def chinese?(string)
31
32
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
32
33
 
@@ -34,7 +35,7 @@ module ScriptDetector2
34
35
  end
35
36
 
36
37
  # @param string [String]
37
- # @return [Boolean]
38
+ # @return [Boolean] true if +string+ appears to be Simplified Chinese
38
39
  def simplified_chinese?(string)
39
40
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
40
41
 
@@ -45,7 +46,7 @@ module ScriptDetector2
45
46
  end
46
47
 
47
48
  # @param string [String]
48
- # @return [Boolean]
49
+ # @return [Boolean] true if +string+ appears to be Traditional Chinese
49
50
  def traditional_chinese?(string)
50
51
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
51
52
 
@@ -56,7 +57,7 @@ module ScriptDetector2
56
57
  end
57
58
 
58
59
  # @param string [String]
59
- # @return [Boolean]
60
+ # @return [Boolean] true if +string+ appears to be Korean
60
61
  def korean?(string)
61
62
  return true if hangul?(string)
62
63
  return false if kana?(string)
@@ -68,11 +69,24 @@ module ScriptDetector2
68
69
  end
69
70
 
70
71
  # @param string [String]
71
- # @return [Boolean]
72
+ # @return [Boolean] true if +string+ contains Hangul
72
73
  def hangul?(string)
73
74
  /\p{Hangul}/.match?(string)
74
75
  end
75
76
 
77
+ # Make a best-effort attempt to guess the single script that best describes +string+.
78
+ # Result is a symbol representing one of the scripts defined by ISO 15924,
79
+ # namely one of:
80
+ # - Hans (Simplified Chinese)
81
+ # - Hant (Traditional Chinese)
82
+ # - Hani (Unspecified Han)
83
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
84
+ # - Kore (Korean: Hangul, Han)
85
+ # - Zyyy (Undetermined)
86
+ #
87
+ # Note that this is likely to give poor results for very short strings,
88
+ # which are often inherently ambiguous.
89
+ #
76
90
  # @param string [String]
77
91
  # @return [Symbol]
78
92
  def identify_script(string)
@@ -81,15 +95,50 @@ module ScriptDetector2
81
95
 
82
96
  is_hant = traditional_chinese?(string)
83
97
  is_hans = simplified_chinese?(string)
84
- if is_hant && is_hans then :Hani
85
- elsif is_hans then :Hans
98
+ return :Hani if is_hant && is_hans
99
+
100
+ is_japanese = japanese?(string)
101
+ return :Hani if is_japanese && (is_hant || is_hans)
102
+
103
+ # At this point we have determined that the string does not contain
104
+ # Hangul; for such a string to be Korean would be unusual. Allowing Korean
105
+ # to dilute the result to Hani is going to be a loss on average, so we
106
+ # don't handle it like Japanese above.
107
+
108
+ if is_hans then :Hans
86
109
  elsif is_hant then :Hant
87
- elsif japanese?(string) then :Jpan
110
+ elsif is_japanese then :Jpan
88
111
  elsif korean?(string) then :Kore
89
- elsif chinese?(string) then :Hani # rubocop:disable Lint/DuplicateBranch
112
+ elsif chinese?(string) then :Hani
90
113
  else
91
114
  :Zyyy
92
115
  end
93
116
  end
117
+
118
+ # Identify all CJK scripts represented in +string+. Result is a list of symbols
119
+ # representing scripts defined by ISO 15924, namely one or more of:
120
+ # - Hans (Simplified Chinese)
121
+ # - Hant (Traditional Chinese)
122
+ # - Hani (Unspecified Han)
123
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
124
+ # - Kore (Korean: Hangul, Han)
125
+ # - Zyyy (Undetermined)
126
+ #
127
+ # This method does not attempt to identify other scripts such as Latn.
128
+ #
129
+ # @param string [String]
130
+ # @return [Array<Symbol>]
131
+ def identify_scripts(string)
132
+ result = []
133
+
134
+ result << :Hans if simplified_chinese?(string)
135
+ result << :Hant if traditional_chinese?(string)
136
+ result << :Jpan if japanese?(string)
137
+ result << :Kore if korean?(string)
138
+ result << :Hani if chinese?(string) && result.empty?
139
+ result << :Zyyy if result.empty?
140
+
141
+ result
142
+ end
94
143
  end
95
144
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  spec.metadata['homepage_uri'] = spec.homepage
19
19
  spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
20
+ spec.metadata['rubygems_mfa_required'] = 'true'
20
21
 
21
22
  # Specify which files should be added to the gem when it is released.
22
23
  # `git ls-files -z` loads into the gem the files that have been added to git.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: script_detector_2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-10-13 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -100,6 +100,7 @@ licenses:
100
100
  metadata:
101
101
  homepage_uri: https://github.com/amake/script_detector_2
102
102
  source_code_uri: https://github.com/amake/script_detector_2.git
103
+ rubygems_mfa_required: 'true'
103
104
  post_install_message:
104
105
  rdoc_options: []
105
106
  require_paths: